osfmk/vm/vm_fault.c

   1 /*
   2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   vm_fault.c
  60  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  61  *
  62  *      Page fault handling module.
  63  */
  64
  65 #include <mach_cluster_stats.h>
  66 #include <mach_pagemap.h>
  67 #include <libkern/OSAtomic.h>
  68
  69 #include <mach/mach_types.h>
  70 #include <mach/kern_return.h>
  71 #include <mach/message.h>       /* for error codes */
  72 #include <mach/vm_param.h>
  73 #include <mach/vm_behavior.h>
  74 #include <mach/memory_object.h>
  75 /* For memory_object_data_{request,unlock} */
  76 #include <mach/sdt.h>
  77
  78 #include <kern/kern_types.h>
  79 #include <kern/host_statistics.h>
  80 #include <kern/counters.h>
  81 #include <kern/task.h>
  82 #include <kern/thread.h>
  83 #include <kern/sched_prim.h>
  84 #include <kern/host.h>
  85 #include <kern/mach_param.h>
  86 #include <kern/macro_help.h>
  87 #include <kern/zalloc.h>
  88 #include <kern/misc_protos.h>
  89 #include <kern/policy_internal.h>
  90
  91 #include <vm/vm_compressor.h>
  92 #include <vm/vm_compressor_pager.h>
  93 #include <vm/vm_fault.h>
  94 #include <vm/vm_map.h>
  95 #include <vm/vm_object.h>
  96 #include <vm/vm_page.h>
  97 #include <vm/vm_kern.h>
  98 #include <vm/pmap.h>
  99 #include <vm/vm_pageout.h>
 100 #include <vm/vm_protos.h>
 101 #include <vm/vm_external.h>
 102 #include <vm/memory_object.h>
 103 #include <vm/vm_purgeable_internal.h>   /* Needed by some vm_page.h macros */
 104 #include <vm/vm_shared_region.h>
 105
 106 #include <sys/codesign.h>
 107 #include <sys/reason.h>
 108 #include <sys/signalvar.h>
 109
 110 #include <san/kasan.h>
 111
 112 #define VM_FAULT_CLASSIFY       0
 113
 114 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
 115
 116 int vm_protect_privileged_from_untrusted = 1;
 117
 118 unsigned int    vm_object_pagein_throttle = 16;
 119
 120 /*
 121  * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
 122  * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
 123  * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
 124  * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
 125  * keep the UI active so that the user has a chance to kill the offending task before the system
 126  * completely hangs.
 127  *
 128  * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
 129  * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
 130  * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
 131  * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
 132  */
 133
 134 extern void throttle_lowpri_io(int);
 135
 136 extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);
 137
 138 uint64_t vm_hard_throttle_threshold;
 139
 140
 141 OS_ALWAYS_INLINE
 142 boolean_t
 143 NEED_TO_HARD_THROTTLE_THIS_TASK(void)
 144 {
 145         return vm_wants_task_throttled(current_task()) ||
 146                ((vm_page_free_count < vm_page_throttle_limit ||
 147                HARD_THROTTLE_LIMIT_REACHED()) &&
 148                proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED);
 149 }
 150
 151 #define HARD_THROTTLE_DELAY     10000   /* 10000 us == 10 ms */
 152 #define SOFT_THROTTLE_DELAY     200     /* 200 us == .2 ms */
 153
 154 #define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS   6
 155 #define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC  20000
 156
 157
 158 #define VM_STAT_DECOMPRESSIONS()        \
 159 MACRO_BEGIN                             \
 160         VM_STAT_INCR(decompressions);       \
 161         current_thread()->decompressions++; \
 162 MACRO_END
 163
 164 boolean_t current_thread_aborted(void);
 165
 166 /* Forward declarations of internal routines. */
 167 static kern_return_t vm_fault_wire_fast(
 168         vm_map_t        map,
 169         vm_map_offset_t va,
 170         vm_prot_t       prot,
 171         vm_tag_t        wire_tag,
 172         vm_map_entry_t  entry,
 173         pmap_t          pmap,
 174         vm_map_offset_t pmap_addr,
 175         ppnum_t         *physpage_p);
 176
 177 static kern_return_t vm_fault_internal(
 178         vm_map_t        map,
 179         vm_map_offset_t vaddr,
 180         vm_prot_t       caller_prot,
 181         boolean_t       change_wiring,
 182         vm_tag_t        wire_tag,
 183         int             interruptible,
 184         pmap_t          pmap,
 185         vm_map_offset_t pmap_addr,
 186         ppnum_t         *physpage_p);
 187
 188 static void vm_fault_copy_cleanup(
 189         vm_page_t       page,
 190         vm_page_t       top_page);
 191
 192 static void vm_fault_copy_dst_cleanup(
 193         vm_page_t       page);
 194
 195 #if     VM_FAULT_CLASSIFY
 196 extern void vm_fault_classify(vm_object_t       object,
 197     vm_object_offset_t    offset,
 198     vm_prot_t             fault_type);
 199
 200 extern void vm_fault_classify_init(void);
 201 #endif
 202
 203 unsigned long vm_pmap_enter_blocked = 0;
 204 unsigned long vm_pmap_enter_retried = 0;
 205
 206 unsigned long vm_cs_validates = 0;
 207 unsigned long vm_cs_revalidates = 0;
 208 unsigned long vm_cs_query_modified = 0;
 209 unsigned long vm_cs_validated_dirtied = 0;
 210 unsigned long vm_cs_bitmap_validated = 0;
 211
 212 void vm_pre_fault(vm_map_offset_t, vm_prot_t);
 213
 214 extern char *kdp_compressor_decompressed_page;
 215 extern addr64_t kdp_compressor_decompressed_page_paddr;
 216 extern ppnum_t  kdp_compressor_decompressed_page_ppnum;
 217
 218 struct vmrtfr {
 219         int vmrtfr_maxi;
 220         int vmrtfr_curi;
 221         int64_t vmrtf_total;
 222         vm_rtfault_record_t *vm_rtf_records;
 223 } vmrtfrs;
 224 #define VMRTF_DEFAULT_BUFSIZE (4096)
 225 #define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t))
 226 TUNABLE(int, vmrtf_num_records, "vm_rtfault_records", VMRTF_NUM_RECORDS_DEFAULT);
 227
 228 static void vm_rtfrecord_lock(void);
 229 static void vm_rtfrecord_unlock(void);
 230 static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int);
 231
 232 extern lck_grp_t vm_page_lck_grp_bucket;
 233 extern lck_attr_t vm_page_lck_attr;
 234 LCK_SPIN_DECLARE_ATTR(vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
 235
 236 /*
 237  *      Routine:        vm_fault_init
 238  *      Purpose:
 239  *              Initialize our private data structures.
 240  */
 241 __startup_func
 242 void
 243 vm_fault_init(void)
 244 {
 245         int i, vm_compressor_temp;
 246         boolean_t need_default_val = TRUE;
 247         /*
 248          * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
 249          * computed as a percentage of available memory, and the percentage used is scaled inversely with
 250          * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
 251          * and reduce the value down to 10% for very large memory configurations.  This helps give us a
 252          * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
 253          * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
 254          */
 255
 256         vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024 * 1024 * 1024)), 25)) / 100;
 257
 258         /*
 259          * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
 260          */
 261
 262         if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof(vm_compressor_temp))) {
 263                 for (i = 0; i < VM_PAGER_MAX_MODES; i++) {
 264                         if (((vm_compressor_temp & (1 << i)) == vm_compressor_temp)) {
 265                                 need_default_val = FALSE;
 266                                 vm_compressor_mode = vm_compressor_temp;
 267                                 break;
 268                         }
 269                 }
 270                 if (need_default_val) {
 271                         printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
 272                 }
 273         }
 274         if (need_default_val) {
 275                 /* If no boot arg or incorrect boot arg, try device tree. */
 276                 PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
 277         }
 278         printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
 279
 280         PE_parse_boot_argn("vm_protect_privileged_from_untrusted",
 281             &vm_protect_privileged_from_untrusted,
 282             sizeof(vm_protect_privileged_from_untrusted));
 283 }
 284
 285 __startup_func
 286 static void
 287 vm_rtfault_record_init(void)
 288 {
 289         size_t size;
 290
 291         vmrtf_num_records = MAX(vmrtf_num_records, 1);
 292         size = vmrtf_num_records * sizeof(vm_rtfault_record_t);
 293         vmrtfrs.vm_rtf_records = zalloc_permanent(size,
 294             ZALIGN(vm_rtfault_record_t));
 295         vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1;
 296 }
 297 STARTUP(ZALLOC, STARTUP_RANK_MIDDLE, vm_rtfault_record_init);
 298
 299 /*
 300  *      Routine:        vm_fault_cleanup
 301  *      Purpose:
 302  *              Clean up the result of vm_fault_page.
 303  *      Results:
 304  *              The paging reference for "object" is released.
 305  *              "object" is unlocked.
 306  *              If "top_page" is not null,  "top_page" is
 307  *              freed and the paging reference for the object
 308  *              containing it is released.
 309  *
 310  *      In/out conditions:
 311  *              "object" must be locked.
 312  */
 313 void
 314 vm_fault_cleanup(
 315         vm_object_t     object,
 316         vm_page_t       top_page)
 317 {
 318         vm_object_paging_end(object);
 319         vm_object_unlock(object);
 320
 321         if (top_page != VM_PAGE_NULL) {
 322                 object = VM_PAGE_OBJECT(top_page);
 323
 324                 vm_object_lock(object);
 325                 VM_PAGE_FREE(top_page);
 326                 vm_object_paging_end(object);
 327                 vm_object_unlock(object);
 328         }
 329 }
 330
 331 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
 332
 333
 334 boolean_t       vm_page_deactivate_behind = TRUE;
 335 /*
 336  * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 337  */
 338 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW     128
 339 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER    16              /* don't make this too big... */
 340                                                                 /* we use it to size an array on the stack */
 341
 342 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
 343
 344 #define MAX_SEQUENTIAL_RUN      (1024 * 1024 * 1024)
 345
 346 /*
 347  * vm_page_is_sequential
 348  *
 349  * Determine if sequential access is in progress
 350  * in accordance with the behavior specified.
 351  * Update state to indicate current access pattern.
 352  *
 353  * object must have at least the shared lock held
 354  */
 355 static
 356 void
 357 vm_fault_is_sequential(
 358         vm_object_t             object,
 359         vm_object_offset_t      offset,
 360         vm_behavior_t           behavior)
 361 {
 362         vm_object_offset_t      last_alloc;
 363         int                     sequential;
 364         int                     orig_sequential;
 365
 366         last_alloc = object->last_alloc;
 367         sequential = object->sequential;
 368         orig_sequential = sequential;
 369
 370         offset = vm_object_trunc_page(offset);
 371         if (offset == last_alloc && behavior != VM_BEHAVIOR_RANDOM) {
 372                 /* re-faulting in the same page: no change in behavior */
 373                 return;
 374         }
 375
 376         switch (behavior) {
 377         case VM_BEHAVIOR_RANDOM:
 378                 /*
 379                  * reset indicator of sequential behavior
 380                  */
 381                 sequential = 0;
 382                 break;
 383
 384         case VM_BEHAVIOR_SEQUENTIAL:
 385                 if (offset && last_alloc == offset - PAGE_SIZE_64) {
 386                         /*
 387                          * advance indicator of sequential behavior
 388                          */
 389                         if (sequential < MAX_SEQUENTIAL_RUN) {
 390                                 sequential += PAGE_SIZE;
 391                         }
 392                 } else {
 393                         /*
 394                          * reset indicator of sequential behavior
 395                          */
 396                         sequential = 0;
 397                 }
 398                 break;
 399
 400         case VM_BEHAVIOR_RSEQNTL:
 401                 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
 402                         /*
 403                          * advance indicator of sequential behavior
 404                          */
 405                         if (sequential > -MAX_SEQUENTIAL_RUN) {
 406                                 sequential -= PAGE_SIZE;
 407                         }
 408                 } else {
 409                         /*
 410                          * reset indicator of sequential behavior
 411                          */
 412                         sequential = 0;
 413                 }
 414                 break;
 415
 416         case VM_BEHAVIOR_DEFAULT:
 417         default:
 418                 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
 419                         /*
 420                          * advance indicator of sequential behavior
 421                          */
 422                         if (sequential < 0) {
 423                                 sequential = 0;
 424                         }
 425                         if (sequential < MAX_SEQUENTIAL_RUN) {
 426                                 sequential += PAGE_SIZE;
 427                         }
 428                 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
 429                         /*
 430                          * advance indicator of sequential behavior
 431                          */
 432                         if (sequential > 0) {
 433                                 sequential = 0;
 434                         }
 435                         if (sequential > -MAX_SEQUENTIAL_RUN) {
 436                                 sequential -= PAGE_SIZE;
 437                         }
 438                 } else {
 439                         /*
 440                          * reset indicator of sequential behavior
 441                          */
 442                         sequential = 0;
 443                 }
 444                 break;
 445         }
 446         if (sequential != orig_sequential) {
 447                 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
 448                         /*
 449                          * if someone else has already updated object->sequential
 450                          * don't bother trying to update it or object->last_alloc
 451                          */
 452                         return;
 453                 }
 454         }
 455         /*
 456          * I'd like to do this with a OSCompareAndSwap64, but that
 457          * doesn't exist for PPC...  however, it shouldn't matter
 458          * that much... last_alloc is maintained so that we can determine
 459          * if a sequential access pattern is taking place... if only
 460          * one thread is banging on this object, no problem with the unprotected
 461          * update... if 2 or more threads are banging away, we run the risk of
 462          * someone seeing a mangled update... however, in the face of multiple
 463          * accesses, no sequential access pattern can develop anyway, so we
 464          * haven't lost any real info.
 465          */
 466         object->last_alloc = offset;
 467 }
 468
 469
 470 int vm_page_deactivate_behind_count = 0;
 471
 472 /*
 473  * vm_page_deactivate_behind
 474  *
 475  * Determine if sequential access is in progress
 476  * in accordance with the behavior specified.  If
 477  * so, compute a potential page to deactivate and
 478  * deactivate it.
 479  *
 480  * object must be locked.
 481  *
 482  * return TRUE if we actually deactivate a page
 483  */
 484 static
 485 boolean_t
 486 vm_fault_deactivate_behind(
 487         vm_object_t             object,
 488         vm_object_offset_t      offset,
 489         vm_behavior_t           behavior)
 490 {
 491         int             n;
 492         int             pages_in_run = 0;
 493         int             max_pages_in_run = 0;
 494         int             sequential_run;
 495         int             sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 496         vm_object_offset_t      run_offset = 0;
 497         vm_object_offset_t      pg_offset = 0;
 498         vm_page_t       m;
 499         vm_page_t       page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
 500
 501         pages_in_run = 0;
 502 #if TRACEFAULTPAGE
 503         dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
 504 #endif
 505         if (object == kernel_object || vm_page_deactivate_behind == FALSE || (vm_object_trunc_page(offset) != offset)) {
 506                 /*
 507                  * Do not deactivate pages from the kernel object: they
 508                  * are not intended to become pageable.
 509                  * or we've disabled the deactivate behind mechanism
 510                  * or we are dealing with an offset that is not aligned to
 511                  * the system's PAGE_SIZE because in that case we will
 512                  * handle the deactivation on the aligned offset and, thus,
 513                  * the full PAGE_SIZE page once. This helps us avoid the redundant
 514                  * deactivates and the extra faults.
 515                  */
 516                 return FALSE;
 517         }
 518         if ((sequential_run = object->sequential)) {
 519                 if (sequential_run < 0) {
 520                         sequential_behavior = VM_BEHAVIOR_RSEQNTL;
 521                         sequential_run = 0 - sequential_run;
 522                 } else {
 523                         sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 524                 }
 525         }
 526         switch (behavior) {
 527         case VM_BEHAVIOR_RANDOM:
 528                 break;
 529         case VM_BEHAVIOR_SEQUENTIAL:
 530                 if (sequential_run >= (int)PAGE_SIZE) {
 531                         run_offset = 0 - PAGE_SIZE_64;
 532                         max_pages_in_run = 1;
 533                 }
 534                 break;
 535         case VM_BEHAVIOR_RSEQNTL:
 536                 if (sequential_run >= (int)PAGE_SIZE) {
 537                         run_offset = PAGE_SIZE_64;
 538                         max_pages_in_run = 1;
 539                 }
 540                 break;
 541         case VM_BEHAVIOR_DEFAULT:
 542         default:
 543         {       vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
 544
 545                 /*
 546                  * determine if the run of sequential accesss has been
 547                  * long enough on an object with default access behavior
 548                  * to consider it for deactivation
 549                  */
 550                 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
 551                         /*
 552                          * the comparisons between offset and behind are done
 553                          * in this kind of odd fashion in order to prevent wrap around
 554                          * at the end points
 555                          */
 556                         if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
 557                                 if (offset >= behind) {
 558                                         run_offset = 0 - behind;
 559                                         pg_offset = PAGE_SIZE_64;
 560                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 561                                 }
 562                         } else {
 563                                 if (offset < -behind) {
 564                                         run_offset = behind;
 565                                         pg_offset = 0 - PAGE_SIZE_64;
 566                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 567                                 }
 568                         }
 569                 }
 570                 break;}
 571         }
 572         for (n = 0; n < max_pages_in_run; n++) {
 573                 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
 574
 575                 if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->vmp_fictitious && !m->vmp_absent) {
 576                         page_run[pages_in_run++] = m;
 577
 578                         /*
 579                          * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
 580                          *
 581                          * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
 582                          * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
 583                          * new reference happens. If no futher references happen on the page after that remote TLB flushes
 584                          * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
 585                          * by pageout_scan, which is just fine since the last reference would have happened quite far
 586                          * in the past (TLB caches don't hang around for very long), and of course could just as easily
 587                          * have happened before we did the deactivate_behind.
 588                          */
 589                         pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
 590                 }
 591         }
 592         if (pages_in_run) {
 593                 vm_page_lockspin_queues();
 594
 595                 for (n = 0; n < pages_in_run; n++) {
 596                         m = page_run[n];
 597
 598                         vm_page_deactivate_internal(m, FALSE);
 599
 600                         vm_page_deactivate_behind_count++;
 601 #if TRACEFAULTPAGE
 602                         dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
 603 #endif
 604                 }
 605                 vm_page_unlock_queues();
 606
 607                 return TRUE;
 608         }
 609         return FALSE;
 610 }
 611
 612
 613 #if (DEVELOPMENT || DEBUG)
 614 uint32_t        vm_page_creation_throttled_hard = 0;
 615 uint32_t        vm_page_creation_throttled_soft = 0;
 616 uint64_t        vm_page_creation_throttle_avoided = 0;
 617 #endif /* DEVELOPMENT || DEBUG */
 618
 619 static int
 620 vm_page_throttled(boolean_t page_kept)
 621 {
 622         clock_sec_t     elapsed_sec;
 623         clock_sec_t     tv_sec;
 624         clock_usec_t    tv_usec;
 625
 626         thread_t thread = current_thread();
 627
 628         if (thread->options & TH_OPT_VMPRIV) {
 629                 return 0;
 630         }
 631
 632         if (thread->t_page_creation_throttled) {
 633                 thread->t_page_creation_throttled = 0;
 634
 635                 if (page_kept == FALSE) {
 636                         goto no_throttle;
 637                 }
 638         }
 639         if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
 640 #if (DEVELOPMENT || DEBUG)
 641                 thread->t_page_creation_throttled_hard++;
 642                 OSAddAtomic(1, &vm_page_creation_throttled_hard);
 643 #endif /* DEVELOPMENT || DEBUG */
 644                 return HARD_THROTTLE_DELAY;
 645         }
 646
 647         if ((vm_page_free_count < vm_page_throttle_limit || (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
 648             thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
 649                 if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) {
 650 #if (DEVELOPMENT || DEBUG)
 651                         OSAddAtomic64(1, &vm_page_creation_throttle_avoided);
 652 #endif
 653                         goto no_throttle;
 654                 }
 655                 clock_get_system_microtime(&tv_sec, &tv_usec);
 656
 657                 elapsed_sec = tv_sec - thread->t_page_creation_time;
 658
 659                 if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
 660                     (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
 661                         if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
 662                                 /*
 663                                  * we'll reset our stats to give a well behaved app
 664                                  * that was unlucky enough to accumulate a bunch of pages
 665                                  * over a long period of time a chance to get out of
 666                                  * the throttled state... we reset the counter and timestamp
 667                                  * so that if it stays under the rate limit for the next second
 668                                  * it will be back in our good graces... if it exceeds it, it
 669                                  * will remain in the throttled state
 670                                  */
 671                                 thread->t_page_creation_time = tv_sec;
 672                                 thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
 673                         }
 674                         VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1);
 675
 676                         thread->t_page_creation_throttled = 1;
 677
 678                         if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) {
 679 #if (DEVELOPMENT || DEBUG)
 680                                 thread->t_page_creation_throttled_hard++;
 681                                 OSAddAtomic(1, &vm_page_creation_throttled_hard);
 682 #endif /* DEVELOPMENT || DEBUG */
 683                                 return HARD_THROTTLE_DELAY;
 684                         } else {
 685 #if (DEVELOPMENT || DEBUG)
 686                                 thread->t_page_creation_throttled_soft++;
 687                                 OSAddAtomic(1, &vm_page_creation_throttled_soft);
 688 #endif /* DEVELOPMENT || DEBUG */
 689                                 return SOFT_THROTTLE_DELAY;
 690                         }
 691                 }
 692                 thread->t_page_creation_time = tv_sec;
 693                 thread->t_page_creation_count = 0;
 694         }
 695 no_throttle:
 696         thread->t_page_creation_count++;
 697
 698         return 0;
 699 }
 700
 701
 702 /*
 703  * check for various conditions that would
 704  * prevent us from creating a ZF page...
 705  * cleanup is based on being called from vm_fault_page
 706  *
 707  * object must be locked
 708  * object == m->vmp_object
      *
      * Arguments:
      *   object              - the (locked) object being checked
      *   m                   - page in hand, may be VM_PAGE_NULL; it is
      *                         freed on every failure path when non-NULL
      *   first_m             - handed to vm_fault_cleanup() on failure
      *   interruptible_state - interrupt level restored on failure
      *   page_throttle       - TRUE to also apply the vm_page_throttled()
      *                         check
      *
      * Returns:
      *   VM_FAULT_SUCCESS         - nothing prevents the zero-fill; nothing
      *                              was released and the interrupt level
      *                              was not touched
      *   VM_FAULT_MEMORY_ERROR    - object->shadow_severed is set, or
      *                              VM_OBJECT_PURGEABLE_FAULT_ERROR(object)
      *   VM_FAULT_MEMORY_SHORTAGE - vm_page_throttled() asked for a delay;
      *                              returned after delaying
      *   VM_FAULT_INTERRUPTED     - same, but current_thread_aborted()
      *                              was true after the delay
      *
      * On every non-success return "m" has been freed (if non-NULL),
      * vm_fault_cleanup(object, first_m) has been called and the thread
      * interrupt level has been reset to "interruptible_state", so the
      * caller only has to propagate the error.
 709  */
 710 static vm_fault_return_t
 711 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle)
 712 {
 713         int throttle_delay;
 714
 715         if (object->shadow_severed ||
 716             VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
 717                 /*
 718                  * Either:
 719                  * 1. the shadow chain was severed,
 720                  * 2. the purgeable object is volatile or empty and is marked
 721                  *    to fault on access while volatile.
 722                  * Just have to return an error at this point
 723                  */
 724                 if (m != VM_PAGE_NULL) {
 725                         VM_PAGE_FREE(m);
 726                 }
 727                 vm_fault_cleanup(object, first_m);
 728
                     /* undo the interrupt level change made by our caller */
 729                 thread_interrupt_level(interruptible_state);
 730
 731                 return VM_FAULT_MEMORY_ERROR;
 732         }
 733         if (page_throttle == TRUE) {
                     /* a non-zero result is the delay to apply before failing */
 734                 if ((throttle_delay = vm_page_throttled(FALSE))) {
 735                         /*
 736                          * we're throttling zero-fills...
 737                          * treat this as if we couldn't grab a page
 738                          */
 739                         if (m != VM_PAGE_NULL) {
 740                                 VM_PAGE_FREE(m);
 741                         }
 742                         vm_fault_cleanup(object, first_m);
 743
 744                         VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
 745
                             /*
                              * The delay is taken only after vm_fault_cleanup().
                              * NOTE(review): presumably the object lock has been
                              * dropped by then -- confirm in vm_fault_cleanup().
                              */
 746                         delay(throttle_delay);
 747
                             /* an abort that arrived during the delay wins over the shortage */
 748                         if (current_thread_aborted()) {
 749                                 thread_interrupt_level(interruptible_state);
 750                                 return VM_FAULT_INTERRUPTED;
 751                         }
 752                         thread_interrupt_level(interruptible_state);
 753
 754                         return VM_FAULT_MEMORY_SHORTAGE;
 755                 }
 756         }
             /* no error: nothing was released, the caller keeps going */
 757         return VM_FAULT_SUCCESS;
 758 }
 759
 760 /*
 761  * Clear the code signing bits on the given page_t
      *
      * Resets vmp_cs_validated, vmp_cs_tainted and vmp_cs_nx of "m" to
      * VMP_CS_ALL_FALSE.  No locking is done here; nothing else on the
      * page is modified.
 762  */
 763 static void
 764 vm_fault_cs_clear(vm_page_t m)
 765 {
 766         m->vmp_cs_validated = VMP_CS_ALL_FALSE;
 767         m->vmp_cs_tainted = VMP_CS_ALL_FALSE;
 768         m->vmp_cs_nx = VMP_CS_ALL_FALSE;
 769 }
 770
 771 /*
 772  * Enqueues the given page on the throttled queue.
 773  * The caller must hold the vm_page_queue_lock and it will be held on return.
      *
      * The page must not be wired (asserted below).  It is first taken off
      * whatever page queue it is currently on, then entered on
      * vm_page_queue_throttled; vmp_q_state and vm_page_throttled_count are
      * updated to match.
 774  */
 775 static void
 776 vm_fault_enqueue_throttled_locked(vm_page_t m)
 777 {
 778         LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
 779         assert(!VM_PAGE_WIRED(m));
 780
 781         /*
 782          * can't be on the pageout queue since we don't
 783          * have a pager to try and clean to
 784          */
 785         vm_page_queues_remove(m, TRUE);
 786         vm_page_check_pageable_safe(m);
             /* link the page in, then keep its state and the global count in sync */
 787         vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
 788         m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
 789         vm_page_throttled_count++;
 790 }
 791
 792 /*
 793  * do the work to zero fill a page and
 794  * inject it into the correct paging queue
 795  *
 796  * m->vmp_object must be locked
 797  * page queue lock must NOT be held
      *
      * Arguments:
      *   m            - the page to prepare
      *   no_zero_fill - TRUE to skip zeroing the page contents
      *
      * Returns the fault type to report: DBG_ZERO_FILL_FAULT when the page
      * was zeroed, DBG_NZF_PAGE_FAULT when no_zero_fill was TRUE.
      *
      * In every case the page's code signing bits are cleared and
      * vmp_pmapped is set.  The page queue lock is only taken (and released
      * again) on the path that moves the page to the throttled queue.
 798  */
 799 static int
 800 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
 801 {
 802         int my_fault = DBG_ZERO_FILL_FAULT;
 803         vm_object_t     object;
 804
 805         object = VM_PAGE_OBJECT(m);
 806
 807         /*
 808          * This is a zero-fill page fault...
 809          *
 810          * Checking the page lock is a waste of
 811          * time;  this page was absent, so
 812          * it can't be page locked by a pager.
 813          *
 814          * we also consider it undefined
 815          * with respect to instruction
 816          * execution.  i.e. it is the responsibility
 817          * of higher layers to call for an instruction
 818          * sync after changing the contents and before
 819          * sending a program into this area.  We
 820          * choose this approach for performance
 821          */
 822         vm_fault_cs_clear(m);
 823         m->vmp_pmapped = TRUE;
 824
 825         if (no_zero_fill == TRUE) {
 826                 my_fault = DBG_NZF_PAGE_FAULT;
 827
                     /*
                      * A page that is both absent and busy is returned as is:
                      * it is neither zeroed nor moved to another queue.
                      * (vm_fault_page sets vmp_absent just before calling us
                      * when fault_info->mark_zf_absent is set.)
                      */
 828                 if (m->vmp_absent && m->vmp_busy) {
 829                         return my_fault;
 830                 }
 831         } else {
 832                 vm_page_zero_fill(m);
 833
 834                 VM_STAT_INCR(zero_fill_count);
 835                 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
 836         }
 837         assert(!m->vmp_laundry);
 838         assert(object != kernel_object);
 839         //assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
             /*
              * Without dynamic paging, pages of objects in one of the three
              * purgeable states tested below go on the throttled queue.
              */
 840         if (!VM_DYNAMIC_PAGING_ENABLED() &&
 841             (object->purgable == VM_PURGABLE_DENY ||
 842             object->purgable == VM_PURGABLE_NONVOLATILE ||
 843             object->purgable == VM_PURGABLE_VOLATILE)) {
 844                 vm_page_lockspin_queues();
                     /* re-check now that the page queue lock is held */
 845                 if (!VM_DYNAMIC_PAGING_ENABLED()) {
 846                         vm_fault_enqueue_throttled_locked(m);
 847                 }
 848                 vm_page_unlock_queues();
 849         }
 850         return my_fault;
 851 }
 852
 853
 854 /*
 855  *      Routine:        vm_fault_page
 856  *      Purpose:
 857  *              Find the resident page for the virtual memory
 858  *              specified by the given virtual memory object
 859  *              and offset.
 860  *      Additional arguments:
 861  *              The required permissions for the page are given
 862  *              in "fault_type".  Desired permissions are included
 863  *              in "protection".
 864  *              fault_info is passed along to determine pagein cluster
 865  *              limits... it contains the expected reference pattern,
 866  *              cluster size if available, etc...
 867  *
 868  *              If the desired page is known to be resident (for
 869  *              example, because it was previously wired down), asserting
 870  *              the "must_be_resident" parameter will speed the search.
 871  *
 872  *              If the operation can be interrupted (by thread_abort
 873  *              or thread_terminate), then the "interruptible" field
 874  *              of "fault_info" should be set accordingly.
 875  *
 876  *      Results:
 877  *              The page containing the proper data is returned
 878  *              in "result_page".
 879  *
 880  *      In/out conditions:
 881  *              The source object must be locked and referenced,
 882  *              and must donate one paging reference.  The reference
 883  *              is not affected.  The paging reference and lock are
 884  *              consumed.
 885  *
 886  *              If the call succeeds, the object in which "result_page"
 887  *              resides is left locked and holding a paging reference.
 888  *              If this is not the original object, a busy page in the
 889  *              original object is returned in "top_page", to prevent other
 890  *              callers from pursuing this same data, along with a paging
 891  *              reference for the original object.  The "top_page" should
 892  *              be destroyed when this guarantee is no longer required.
 893  *              The "result_page" is also left busy.  It is not removed
 894  *              from the pageout queues.
 895  *      Special Case:
 896  *              A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
 897  *              fault succeeded but there's no VM page (i.e. the VM object
 898  *              does not actually hold VM pages, but device memory or
 899  *              large pages).  The object is still locked and we still hold a
 900  *              paging_in_progress reference.
 901  */
 902 unsigned int vm_fault_page_blocked_access = 0; /* bumped in vm_fault_page after each wait for object->blocked_access to clear */
 903 unsigned int vm_fault_page_forced_retry = 0; /* NOTE(review): presumably counts forced fault retries (see force_fault_retry in vm_fault_page) -- confirm */
 904
 905 vm_fault_return_t
 906 vm_fault_page(
 907         /* Arguments: */
 908         vm_object_t     first_object,   /* Object to begin search */
 909         vm_object_offset_t first_offset,        /* Offset into object */
 910         vm_prot_t       fault_type,     /* What access is requested */
 911         boolean_t       must_be_resident,/* Must page be resident? */
 912         boolean_t       caller_lookup,  /* caller looked up page */
 913         /* Modifies in place: */
 914         vm_prot_t       *protection,    /* Protection for mapping */
 915         vm_page_t       *result_page,   /* Page found, if successful */
 916         /* Returns: */
 917         vm_page_t       *top_page,      /* Page in top object, if
 918                                          * not result_page.  */
 919         int             *type_of_fault, /* if non-null, fill in with type of fault
 920                                          * COW, zero-fill, etc... returned in trace point */
 921         /* More arguments: */
 922         kern_return_t   *error_code,    /* code if page is in error */
 923         boolean_t       no_zero_fill,   /* don't zero fill absent pages */
 924         boolean_t       data_supply,    /* treat as data_supply if
 925                                          * it is a write fault and a full
 926                                          * page is provided */
 927         vm_object_fault_info_t fault_info)
 928 {
 929         vm_page_t               m;
 930         vm_object_t             object;
 931         vm_object_offset_t      offset;
 932         vm_page_t               first_m;
 933         vm_object_t             next_object;
 934         vm_object_t             copy_object;
 935         boolean_t               look_for_page;
 936         boolean_t               force_fault_retry = FALSE;
 937         vm_prot_t               access_required = fault_type;
 938         vm_prot_t               wants_copy_flag;
 939         kern_return_t           wait_result;
 940         wait_interrupt_t        interruptible_state;
 941         boolean_t               data_already_requested = FALSE;
 942         vm_behavior_t           orig_behavior;
 943         vm_size_t               orig_cluster_size;
 944         vm_fault_return_t       error;
 945         int                     my_fault;
 946         uint32_t                try_failed_count;
 947         int                     interruptible; /* how may fault be interrupted? */
 948         int                     external_state = VM_EXTERNAL_STATE_UNKNOWN;
 949         memory_object_t         pager;
 950         vm_fault_return_t       retval;
 951         int                     grab_options;
 952
 953 /*
 954  * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
 955  * marked as paged out in the compressor pager or the pager doesn't exist.
 956  * Note also that if the pager for an internal object
 957  * has not been created, the pager is not invoked regardless of the value
 958  * of MUST_ASK_PAGER().
 959  *
 960  * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 961  * is marked as paged out in the compressor pager.
 962  * PAGED_OUT() is used to determine if a page has already been pushed
 963  * into a copy object in order to avoid a redundant page out operation.
 964  */
 965 #define MUST_ASK_PAGER(o, f, s)                                 \
 966         ((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
 967
 968 #define PAGED_OUT(o, f) \
 969         (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
 970
 971 /*
 972  *      Recovery actions
 973  */
 974 #define RELEASE_PAGE(m)                                 \
 975         MACRO_BEGIN                                     \
 976         PAGE_WAKEUP_DONE(m);                            \
 977         if ( !VM_PAGE_PAGEABLE(m)) {                    \
 978                 vm_page_lockspin_queues();              \
 979                 if ( !VM_PAGE_PAGEABLE(m)) {            \
 980                         if (VM_CONFIG_COMPRESSOR_IS_ACTIVE)     \
 981                                 vm_page_deactivate(m);          \
 982                         else                                    \
 983                                 vm_page_activate(m);            \
 984                 }                                               \
 985                 vm_page_unlock_queues();                        \
 986         }                                                       \
 987         MACRO_END
 988
 989 #if TRACEFAULTPAGE
 990         dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
 991 #endif
 992
 993         interruptible = fault_info->interruptible;
 994         interruptible_state = thread_interrupt_level(interruptible);
 995
 996         /*
 997          *      INVARIANTS (through entire routine):
 998          *
 999          *      1)      At all times, we must either have the object
1000          *              lock or a busy page in some object to prevent
1001          *              some other thread from trying to bring in
1002          *              the same page.
1003          *
1004          *              Note that we cannot hold any locks during the
1005          *              pager access or when waiting for memory, so
1006          *              we use a busy page then.
1007          *
1008          *      2)      To prevent another thread from racing us down the
1009          *              shadow chain and entering a new page in the top
1010          *              object before we do, we must keep a busy page in
1011          *              the top object while following the shadow chain.
1012          *
1013          *      3)      We must increment paging_in_progress on any object
1014          *              for which we have a busy page before dropping
1015          *              the object lock
1016          *
1017          *      4)      We leave busy pages on the pageout queues.
1018          *              If the pageout daemon comes across a busy page,
1019          *              it will remove the page from the pageout queues.
1020          */
1021
1022         object = first_object;
1023         offset = first_offset;
1024         first_m = VM_PAGE_NULL;
1025         access_required = fault_type;
1026
1027         /*
1028          * default type of fault
1029          */
1030         my_fault = DBG_CACHE_HIT_FAULT;
1031
1032         while (TRUE) {
1033 #if TRACEFAULTPAGE
1034                 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1035 #endif
1036
1037                 grab_options = 0;
1038 #if CONFIG_SECLUDED_MEMORY
1039                 if (object->can_grab_secluded) {
1040                         grab_options |= VM_PAGE_GRAB_SECLUDED;
1041                 }
1042 #endif /* CONFIG_SECLUDED_MEMORY */
1043
1044                 if (!object->alive) {
1045                         /*
1046                          * object is no longer valid
1047                          * clean up and return error
1048                          */
1049                         vm_fault_cleanup(object, first_m);
1050                         thread_interrupt_level(interruptible_state);
1051
1052                         return VM_FAULT_MEMORY_ERROR;
1053                 }
1054
1055                 if (!object->pager_created && object->phys_contiguous) {
1056                         /*
1057                          * A physically-contiguous object without a pager:
1058                          * must be a "large page" object.  We do not deal
1059                          * with VM pages for this object.
1060                          */
1061                         caller_lookup = FALSE;
1062                         m = VM_PAGE_NULL;
1063                         goto phys_contig_object;
1064                 }
1065
1066                 if (object->blocked_access) {
1067                         /*
1068                          * Access to this VM object has been blocked.
1069                          * Replace our "paging_in_progress" reference with
1070                          * a "activity_in_progress" reference and wait for
1071                          * access to be unblocked.
1072                          */
1073                         caller_lookup = FALSE; /* no longer valid after sleep */
1074                         vm_object_activity_begin(object);
1075                         vm_object_paging_end(object);
1076                         while (object->blocked_access) {
1077                                 vm_object_sleep(object,
1078                                     VM_OBJECT_EVENT_UNBLOCKED,
1079                                     THREAD_UNINT);
1080                         }
1081                         vm_fault_page_blocked_access++;
1082                         vm_object_paging_begin(object);
1083                         vm_object_activity_end(object);
1084                 }
1085
1086                 /*
1087                  * See whether the page at 'offset' is resident
1088                  */
1089                 if (caller_lookup == TRUE) {
1090                         /*
1091                          * The caller has already looked up the page
1092                          * and gave us the result in "result_page".
1093                          * We can use this for the first lookup but
1094                          * it loses its validity as soon as we unlock
1095                          * the object.
1096                          */
1097                         m = *result_page;
1098                         caller_lookup = FALSE; /* no longer valid after that */
1099                 } else {
1100                         m = vm_page_lookup(object, vm_object_trunc_page(offset));
1101                 }
1102 #if TRACEFAULTPAGE
1103                 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1104 #endif
1105                 if (m != VM_PAGE_NULL) {
1106                         if (m->vmp_busy) {
1107                                 /*
1108                                  * The page is being brought in,
1109                                  * wait for it and then retry.
1110                                  */
1111 #if TRACEFAULTPAGE
1112                                 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1113 #endif
1114                                 wait_result = PAGE_SLEEP(object, m, interruptible);
1115
1116                                 counter(c_vm_fault_page_block_busy_kernel++);
1117
1118                                 if (wait_result != THREAD_AWAKENED) {
1119                                         vm_fault_cleanup(object, first_m);
1120                                         thread_interrupt_level(interruptible_state);
1121
1122                                         if (wait_result == THREAD_RESTART) {
1123                                                 return VM_FAULT_RETRY;
1124                                         } else {
1125                                                 return VM_FAULT_INTERRUPTED;
1126                                         }
1127                                 }
1128                                 continue;
1129                         }
1130                         if (m->vmp_laundry) {
1131                                 m->vmp_free_when_done = FALSE;
1132
1133                                 if (!m->vmp_cleaning) {
1134                                         vm_pageout_steal_laundry(m, FALSE);
1135                                 }
1136                         }
1137                         if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
1138                                 /*
1139                                  * Guard page: off limits !
1140                                  */
1141                                 if (fault_type == VM_PROT_NONE) {
1142                                         /*
1143                                          * The fault is not requesting any
1144                                          * access to the guard page, so it must
1145                                          * be just to wire or unwire it.
1146                                          * Let's pretend it succeeded...
1147                                          */
1148                                         m->vmp_busy = TRUE;
1149                                         *result_page = m;
1150                                         assert(first_m == VM_PAGE_NULL);
1151                                         *top_page = first_m;
1152                                         if (type_of_fault) {
1153                                                 *type_of_fault = DBG_GUARD_FAULT;
1154                                         }
1155                                         thread_interrupt_level(interruptible_state);
1156                                         return VM_FAULT_SUCCESS;
1157                                 } else {
1158                                         /*
1159                                          * The fault requests access to the
1160                                          * guard page: let's deny that !
1161                                          */
1162                                         vm_fault_cleanup(object, first_m);
1163                                         thread_interrupt_level(interruptible_state);
1164                                         return VM_FAULT_MEMORY_ERROR;
1165                                 }
1166                         }
1167
1168                         if (m->vmp_error) {
1169                                 /*
1170                                  * The page is in error, give up now.
1171                                  */
1172 #if TRACEFAULTPAGE
1173                                 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);      /* (TEST/DEBUG) */
1174 #endif
1175                                 if (error_code) {
1176                                         *error_code = KERN_MEMORY_ERROR;
1177                                 }
1178                                 VM_PAGE_FREE(m);
1179
1180                                 vm_fault_cleanup(object, first_m);
1181                                 thread_interrupt_level(interruptible_state);
1182
1183                                 return VM_FAULT_MEMORY_ERROR;
1184                         }
1185                         if (m->vmp_restart) {
1186                                 /*
1187                                  * The pager wants us to restart
1188                                  * at the top of the chain,
1189                                  * typically because it has moved the
1190                                  * page to another pager, then do so.
1191                                  */
1192 #if TRACEFAULTPAGE
1193                                 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1194 #endif
1195                                 VM_PAGE_FREE(m);
1196
1197                                 vm_fault_cleanup(object, first_m);
1198                                 thread_interrupt_level(interruptible_state);
1199
1200                                 return VM_FAULT_RETRY;
1201                         }
1202                         if (m->vmp_absent) {
1203                                 /*
1204                                  * The page isn't busy, but is absent,
1205                                  * therefore it's deemed "unavailable".
1206                                  *
1207                                  * Remove the non-existent page (unless it's
1208                                  * in the top object) and move on down to the
1209                                  * next object (if there is one).
1210                                  */
1211 #if TRACEFAULTPAGE
1212                                 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);  /* (TEST/DEBUG) */
1213 #endif
1214                                 next_object = object->shadow;
1215
1216                                 if (next_object == VM_OBJECT_NULL) {
1217                                         /*
1218                                          * Absent page at bottom of shadow
1219                                          * chain; zero fill the page we left
1220                                          * busy in the first object, and free
1221                                          * the absent page.
1222                                          */
1223                                         assert(!must_be_resident);
1224
1225                                         /*
1226                                          * check for any conditions that prevent
1227                                          * us from creating a new zero-fill page
1228                                          * vm_fault_check will do all of the
1229                                          * fault cleanup in the case of an error condition
1230                                          * including resetting the thread_interrupt_level
1231                                          */
1232                                         error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1233
1234                                         if (error != VM_FAULT_SUCCESS) {
1235                                                 return error;
1236                                         }
1237
1238                                         if (object != first_object) {
1239                                                 /*
1240                                                  * free the absent page we just found
1241                                                  */
1242                                                 VM_PAGE_FREE(m);
1243
1244                                                 /*
1245                                                  * drop reference and lock on current object
1246                                                  */
1247                                                 vm_object_paging_end(object);
1248                                                 vm_object_unlock(object);
1249
1250                                                 /*
1251                                                  * grab the original page we
1252                                                  * 'soldered' in place and
1253                                                  * retake lock on 'first_object'
1254                                                  */
1255                                                 m = first_m;
1256                                                 first_m = VM_PAGE_NULL;
1257
1258                                                 object = first_object;
1259                                                 offset = first_offset;
1260
1261                                                 vm_object_lock(object);
1262                                         } else {
1263                                                 /*
1264                                                  * we're going to use the absent page we just found
1265                                                  * so convert it to a 'busy' page
1266                                                  */
1267                                                 m->vmp_absent = FALSE;
1268                                                 m->vmp_busy = TRUE;
1269                                         }
1270                                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
1271                                                 m->vmp_absent = TRUE;
1272                                         }
1273                                         /*
1274                                          * zero-fill the page and put it on
1275                                          * the correct paging queue
1276                                          */
1277                                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1278
1279                                         break;
1280                                 } else {
1281                                         if (must_be_resident) {
1282                                                 vm_object_paging_end(object);
1283                                         } else if (object != first_object) {
1284                                                 vm_object_paging_end(object);
1285                                                 VM_PAGE_FREE(m);
1286                                         } else {
1287                                                 first_m = m;
1288                                                 m->vmp_absent = FALSE;
1289                                                 m->vmp_busy = TRUE;
1290
1291                                                 vm_page_lockspin_queues();
1292                                                 vm_page_queues_remove(m, FALSE);
1293                                                 vm_page_unlock_queues();
1294                                         }
1295
1296                                         offset += object->vo_shadow_offset;
1297                                         fault_info->lo_offset += object->vo_shadow_offset;
1298                                         fault_info->hi_offset += object->vo_shadow_offset;
1299                                         access_required = VM_PROT_READ;
1300
1301                                         vm_object_lock(next_object);
1302                                         vm_object_unlock(object);
1303                                         object = next_object;
1304                                         vm_object_paging_begin(object);
1305
1306                                         /*
1307                                          * reset to default type of fault
1308                                          */
1309                                         my_fault = DBG_CACHE_HIT_FAULT;
1310
1311                                         continue;
1312                                 }
1313                         }
1314                         if ((m->vmp_cleaning)
1315                             && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1316                             && (fault_type & VM_PROT_WRITE)) {
1317                                 /*
1318                                  * This is a copy-on-write fault that will
1319                                  * cause us to revoke access to this page, but
1320                                  * this page is in the process of being cleaned
1321                                  * in a clustered pageout. We must wait until
1322                                  * the cleaning operation completes before
1323                                  * revoking access to the original page,
1324                                  * otherwise we might attempt to remove a
1325                                  * wired mapping.
1326                                  */
1327 #if TRACEFAULTPAGE
1328                                 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
1329 #endif
1330                                 /*
1331                                  * take an extra ref so that object won't die
1332                                  */
1333                                 vm_object_reference_locked(object);
1334
1335                                 vm_fault_cleanup(object, first_m);
1336
1337                                 counter(c_vm_fault_page_block_backoff_kernel++);
1338                                 vm_object_lock(object);
1339                                 assert(object->ref_count > 0);
1340
1341                                 m = vm_page_lookup(object, vm_object_trunc_page(offset));
1342
1343                                 if (m != VM_PAGE_NULL && m->vmp_cleaning) {
1344                                         PAGE_ASSERT_WAIT(m, interruptible);
1345
1346                                         vm_object_unlock(object);
1347                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1348                                         vm_object_deallocate(object);
1349
1350                                         goto backoff;
1351                                 } else {
1352                                         vm_object_unlock(object);
1353
1354                                         vm_object_deallocate(object);
1355                                         thread_interrupt_level(interruptible_state);
1356
1357                                         return VM_FAULT_RETRY;
1358                                 }
1359                         }
1360                         if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
1361                             !(fault_info != NULL && fault_info->stealth)) {
1362                                 /*
1363                                  * If we were passed a non-NULL pointer for
1364                                  * "type_of_fault", then we came from
1365                                  * vm_fault... we'll let it deal with
1366                                  * this condition, since it
1367                                  * needs to see m->vmp_speculative to correctly
1368                                  * account the pageins, otherwise...
1369                                  * take it off the speculative queue, we'll
1370                                  * let the caller of vm_fault_page deal
1371                                  * with getting it onto the correct queue
1372                                  *
1373                                  * If the caller specified in fault_info that
1374                                  * it wants a "stealth" fault, we also leave
1375                                  * the page in the speculative queue.
1376                                  */
1377                                 vm_page_lockspin_queues();
1378                                 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
1379                                         vm_page_queues_remove(m, FALSE);
1380                                 }
1381                                 vm_page_unlock_queues();
1382                         }
1383                         assert(object == VM_PAGE_OBJECT(m));
1384
1385                         if (object->code_signed) {
1386                                 /*
1387                                  * CODE SIGNING:
1388                                  * We just paged in a page from a signed
1389                                  * memory object but we don't need to
1390                                  * validate it now.  We'll validate it if and
1391                                  * when it gets mapped into a user address
1392                                  * space for the first time or when the page
1393                                  * gets copied to another object as a result
1394                                  * of a copy-on-write.
1395                                  */
1396                         }
1397
1398                         /*
1399                          * We mark the page busy and leave it on
1400                          * the pageout queues.  If the pageout
1401                          * daemon comes across it, then it will
1402                          * remove the page from the queue, but not the object
1403                          */
1404 #if TRACEFAULTPAGE
1405                         dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1406 #endif
1407                         assert(!m->vmp_busy);
1408                         assert(!m->vmp_absent);
1409
1410                         m->vmp_busy = TRUE;
1411                         break;
1412                 }
1413
1414
1415                 /*
1416                  * we get here when there is no page present in the object at
1417                  * the offset we're interested in... we'll allocate a page
1418                  * at this point if the pager associated with
1419                  * this object can provide the data or we're the top object...
1420                  * object is locked;  m == NULL
1421                  */
1422
1423                 if (must_be_resident) {
1424                         if (fault_type == VM_PROT_NONE &&
1425                             object == kernel_object) {
1426                                 /*
1427                                  * We've been called from vm_fault_unwire()
1428                                  * while removing a map entry that was allocated
1429                                  * with KMA_KOBJECT and KMA_VAONLY.  This page
1430                                  * is not present and there's nothing more to
1431                                  * do here (nothing to unwire).
1432                                  */
1433                                 vm_fault_cleanup(object, first_m);
1434                                 thread_interrupt_level(interruptible_state);
1435
1436                                 return VM_FAULT_MEMORY_ERROR;
1437                         }
1438
1439                         goto dont_look_for_page;
1440                 }
1441
1442                 /* Don't expect to fault pages into the kernel object. */
1443                 assert(object != kernel_object);
1444
1445                 data_supply = FALSE;
1446
1447                 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply);
1448
1449 #if TRACEFAULTPAGE
1450                 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);      /* (TEST/DEBUG) */
1451 #endif
1452                 if (!look_for_page && object == first_object && !object->phys_contiguous) {
1453                         /*
1454                          * Allocate a new page for this object/offset pair as a placeholder
1455                          */
1456                         m = vm_page_grab_options(grab_options);
1457 #if TRACEFAULTPAGE
1458                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1459 #endif
1460                         if (m == VM_PAGE_NULL) {
1461                                 vm_fault_cleanup(object, first_m);
1462                                 thread_interrupt_level(interruptible_state);
1463
1464                                 return VM_FAULT_MEMORY_SHORTAGE;
1465                         }
1466
1467                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1468                                 vm_page_insert_internal(m, object,
1469                                     vm_object_trunc_page(offset),
1470                                     VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1471                         } else {
1472                                 vm_page_insert(m, object, vm_object_trunc_page(offset));
1473                         }
1474                 }
1475                 if (look_for_page) {
1476                         kern_return_t   rc;
1477                         int             my_fault_type;
1478
1479                         /*
1480                          *      If the memory manager is not ready, we
1481                          *      cannot make requests.
1482                          */
1483                         if (!object->pager_ready) {
1484 #if TRACEFAULTPAGE
1485                                 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1486 #endif
1487                                 if (m != VM_PAGE_NULL) {
1488                                         VM_PAGE_FREE(m);
1489                                 }
1490
1491                                 /*
1492                                  * take an extra ref so object won't die
1493                                  */
1494                                 vm_object_reference_locked(object);
1495                                 vm_fault_cleanup(object, first_m);
1496                                 counter(c_vm_fault_page_block_backoff_kernel++);
1497
1498                                 vm_object_lock(object);
1499                                 assert(object->ref_count > 0);
1500
1501                                 if (!object->pager_ready) {
1502                                         wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1503
1504                                         vm_object_unlock(object);
1505                                         if (wait_result == THREAD_WAITING) {
1506                                                 wait_result = thread_block(THREAD_CONTINUE_NULL);
1507                                         }
1508                                         vm_object_deallocate(object);
1509
1510                                         goto backoff;
1511                                 } else {
1512                                         vm_object_unlock(object);
1513                                         vm_object_deallocate(object);
1514                                         thread_interrupt_level(interruptible_state);
1515
1516                                         return VM_FAULT_RETRY;
1517                                 }
1518                         }
1519                         if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1520                                 /*
1521                                  * If there are too many outstanding page
1522                                  * requests pending on this external object, we
1523                                  * wait for them to be resolved now.
1524                                  */
1525 #if TRACEFAULTPAGE
1526                                 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1527 #endif
1528                                 if (m != VM_PAGE_NULL) {
1529                                         VM_PAGE_FREE(m);
1530                                 }
1531                                 /*
1532                                  * take an extra ref so object won't die
1533                                  */
1534                                 vm_object_reference_locked(object);
1535
1536                                 vm_fault_cleanup(object, first_m);
1537
1538                                 counter(c_vm_fault_page_block_backoff_kernel++);
1539
1540                                 vm_object_lock(object);
1541                                 assert(object->ref_count > 0);
1542
1543                                 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1544                                         vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1545
1546                                         vm_object_unlock(object);
1547                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1548                                         vm_object_deallocate(object);
1549
1550                                         goto backoff;
1551                                 } else {
1552                                         vm_object_unlock(object);
1553                                         vm_object_deallocate(object);
1554                                         thread_interrupt_level(interruptible_state);
1555
1556                                         return VM_FAULT_RETRY;
1557                                 }
1558                         }
1559                         if (object->internal) {
1560                                 int compressed_count_delta;
1561
1562                                 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
1563
1564                                 if (m == VM_PAGE_NULL) {
1565                                         /*
1566                                          * Allocate a new page for this object/offset pair as a placeholder
1567                                          */
1568                                         m = vm_page_grab_options(grab_options);
1569 #if TRACEFAULTPAGE
1570                                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1571 #endif
1572                                         if (m == VM_PAGE_NULL) {
1573                                                 vm_fault_cleanup(object, first_m);
1574                                                 thread_interrupt_level(interruptible_state);
1575
1576                                                 return VM_FAULT_MEMORY_SHORTAGE;
1577                                         }
1578
1579                                         m->vmp_absent = TRUE;
1580                                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1581                                                 vm_page_insert_internal(m, object, vm_object_trunc_page(offset), VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1582                                         } else {
1583                                                 vm_page_insert(m, object, vm_object_trunc_page(offset));
1584                                         }
1585                                 }
1586                                 assert(m->vmp_busy);
1587
1588                                 m->vmp_absent = TRUE;
1589                                 pager = object->pager;
1590
1591                                 assert(object->paging_in_progress > 0);
1592                                 vm_object_unlock(object);
1593
1594                                 rc = vm_compressor_pager_get(
1595                                         pager,
1596                                         offset + object->paging_offset,
1597                                         VM_PAGE_GET_PHYS_PAGE(m),
1598                                         &my_fault_type,
1599                                         0,
1600                                         &compressed_count_delta);
1601
1602                                 if (type_of_fault == NULL) {
1603                                         int     throttle_delay;
1604
1605                                         /*
1606                                          * we weren't called from vm_fault, so we
1607                                          * need to apply page creation throttling
1608                                          * do it before we re-acquire any locks
1609                                          */
1610                                         if (my_fault_type == DBG_COMPRESSOR_FAULT) {
1611                                                 if ((throttle_delay = vm_page_throttled(TRUE))) {
1612                                                         VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
1613                                                         delay(throttle_delay);
1614                                                 }
1615                                         }
1616                                 }
1617                                 vm_object_lock(object);
1618                                 assert(object->paging_in_progress > 0);
1619
1620                                 vm_compressor_pager_count(
1621                                         pager,
1622                                         compressed_count_delta,
1623                                         FALSE, /* shared_lock */
1624                                         object);
1625
1626                                 switch (rc) {
1627                                 case KERN_SUCCESS:
1628                                         m->vmp_absent = FALSE;
1629                                         m->vmp_dirty = TRUE;
1630                                         if ((object->wimg_bits &
1631                                             VM_WIMG_MASK) !=
1632                                             VM_WIMG_USE_DEFAULT) {
1633                                                 /*
1634                                                  * If the page is not cacheable,
1635                                                  * we can't let its contents
1636                                                  * linger in the data cache
1637                                                  * after the decompression.
1638                                                  */
1639                                                 pmap_sync_page_attributes_phys(
1640                                                         VM_PAGE_GET_PHYS_PAGE(m));
1641                                         } else {
1642                                                 m->vmp_written_by_kernel = TRUE;
1643                                         }
1644
1645                                         /*
1646                                          * If the object is purgeable, its
1647                                          * owner's purgeable ledgers have been
1648                                          * updated in vm_page_insert() but the
1649                                          * page was also accounted for in a
1650                                          * "compressed purgeable" ledger, so
1651                                          * update that now.
1652                                          */
1653                                         if (((object->purgable !=
1654                                             VM_PURGABLE_DENY) ||
1655                                             object->vo_ledger_tag) &&
1656                                             (object->vo_owner !=
1657                                             NULL)) {
1658                                                 /*
1659                                                  * One less compressed
1660                                                  * purgeable/tagged page.
1661                                                  */
1662                                                 vm_object_owner_compressed_update(
1663                                                         object,
1664                                                         -1);
1665                                         }
1666
1667                                         break;
1668                                 case KERN_MEMORY_FAILURE:
1669                                         m->vmp_unusual = TRUE;
1670                                         m->vmp_error = TRUE;
1671                                         m->vmp_absent = FALSE;
1672                                         break;
1673                                 case KERN_MEMORY_ERROR:
1674                                         assert(m->vmp_absent);
1675                                         break;
1676                                 default:
1677                                         panic("vm_fault_page(): unexpected "
1678                                             "error %d from "
1679                                             "vm_compressor_pager_get()\n",
1680                                             rc);
1681                                 }
1682                                 PAGE_WAKEUP_DONE(m);
1683
1684                                 rc = KERN_SUCCESS;
1685                                 goto data_requested;
1686                         }
1687                         my_fault_type = DBG_PAGEIN_FAULT;
1688
1689                         if (m != VM_PAGE_NULL) {
1690                                 VM_PAGE_FREE(m);
1691                                 m = VM_PAGE_NULL;
1692                         }
1693
1694 #if TRACEFAULTPAGE
1695                         dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);  /* (TEST/DEBUG) */
1696 #endif
1697
1698                         /*
1699                          * It's possible someone called vm_object_destroy while we weren't
1700                          * holding the object lock.  If that has happened, then bail out
1701                          * here.
1702                          */
1703
1704                         pager = object->pager;
1705
1706                         if (pager == MEMORY_OBJECT_NULL) {
1707                                 vm_fault_cleanup(object, first_m);
1708                                 thread_interrupt_level(interruptible_state);
1709                                 return VM_FAULT_MEMORY_ERROR;
1710                         }
1711
1712                         /*
1713                          * We have an absent page in place for the faulting offset,
1714                          * so we can release the object lock.
1715                          */
1716
1717                         if (object->object_is_shared_cache) {
1718                                 set_thread_rwlock_boost();
1719                         }
1720
1721                         vm_object_unlock(object);
1722
1723                         /*
1724                          * If this object uses a copy_call strategy,
1725                          * and we are interested in a copy of this object
1726                          * (having gotten here only by following a
1727                          * shadow chain), then tell the memory manager
1728                          * via a flag added to the desired_access
1729                          * parameter, so that it can detect a race
1730                          * between our walking down the shadow chain
1731                          * and its pushing pages up into a copy of
1732                          * the object that it manages.
1733                          */
1734                         if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) {
1735                                 wants_copy_flag = VM_PROT_WANTS_COPY;
1736                         } else {
1737                                 wants_copy_flag = VM_PROT_NONE;
1738                         }
1739
1740                         if (object->copy == first_object) {
1741                                 /*
1742                                  * if we issue the memory_object_data_request in
1743                                  * this state, we are subject to a deadlock with
1744                                  * the underlying filesystem if it is trying to
1745                                  * shrink the file resulting in a push of pages
1746                                  * into the copy object...  that push will stall
1747                                  * on the placeholder page, and if the pushing thread
1748                                  * is holding a lock that is required on the pagein
1749                                  * path (such as a truncate lock), we'll deadlock...
1750                                  * to avoid this potential deadlock, we throw away
1751                                  * our placeholder page before calling memory_object_data_request
1752                                  * and force this thread to retry the vm_fault_page after
1753                                  * we have issued the I/O.  the second time through this path
1754                                  * we will find the page already in the cache (presumably still
1755                                  * busy waiting for the I/O to complete) and then complete
1756                                  * the fault w/o having to go through memory_object_data_request again
1757                                  */
1758                                 assert(first_m != VM_PAGE_NULL);
1759                                 assert(VM_PAGE_OBJECT(first_m) == first_object);
1760
1761                                 vm_object_lock(first_object);
1762                                 VM_PAGE_FREE(first_m);
1763                                 vm_object_paging_end(first_object);
1764                                 vm_object_unlock(first_object);
1765
1766                                 first_m = VM_PAGE_NULL;
1767                                 force_fault_retry = TRUE;
1768
1769                                 vm_fault_page_forced_retry++;
1770                         }
1771
1772                         if (data_already_requested == TRUE) {
1773                                 orig_behavior = fault_info->behavior;
1774                                 orig_cluster_size = fault_info->cluster_size;
1775
1776                                 fault_info->behavior = VM_BEHAVIOR_RANDOM;
1777                                 fault_info->cluster_size = PAGE_SIZE;
1778                         }
1779                         /*
1780                          * Call the memory manager to retrieve the data.
1781                          */
1782                         rc = memory_object_data_request(
1783                                 pager,
1784                                 vm_object_trunc_page(offset) + object->paging_offset,
1785                                 PAGE_SIZE,
1786                                 access_required | wants_copy_flag,
1787                                 (memory_object_fault_info_t)fault_info);
1788
1789                         if (data_already_requested == TRUE) {
1790                                 fault_info->behavior = orig_behavior;
1791                                 fault_info->cluster_size = orig_cluster_size;
1792                         } else {
1793                                 data_already_requested = TRUE;
1794                         }
1795
1796                         DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1797 #if TRACEFAULTPAGE
1798                         dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1799 #endif
1800                         vm_object_lock(object);
1801
1802                         if (object->object_is_shared_cache) {
1803                                 clear_thread_rwlock_boost();
1804                         }
1805
1806 data_requested:
1807                         if (rc != KERN_SUCCESS) {
1808                                 vm_fault_cleanup(object, first_m);
1809                                 thread_interrupt_level(interruptible_state);
1810
1811                                 return (rc == MACH_SEND_INTERRUPTED) ?
1812                                        VM_FAULT_INTERRUPTED :
1813                                        VM_FAULT_MEMORY_ERROR;
1814                         } else {
1815                                 clock_sec_t     tv_sec;
1816                                 clock_usec_t    tv_usec;
1817
1818                                 if (my_fault_type == DBG_PAGEIN_FAULT) {
1819                                         clock_get_system_microtime(&tv_sec, &tv_usec);
1820                                         current_thread()->t_page_creation_time = tv_sec;
1821                                         current_thread()->t_page_creation_count = 0;
1822                                 }
1823                         }
1824                         if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1825                                 vm_fault_cleanup(object, first_m);
1826                                 thread_interrupt_level(interruptible_state);
1827
1828                                 return VM_FAULT_INTERRUPTED;
1829                         }
1830                         if (force_fault_retry == TRUE) {
1831                                 vm_fault_cleanup(object, first_m);
1832                                 thread_interrupt_level(interruptible_state);
1833
1834                                 return VM_FAULT_RETRY;
1835                         }
1836                         if (m == VM_PAGE_NULL && object->phys_contiguous) {
1837                                 /*
1838                                  * No page here means that the object we
1839                                  * initially looked up was "physically
1840                                  * contiguous" (i.e. device memory).  However,
1841                                  * with Virtual VRAM, the object might not
1842                                  * be backed by that device memory anymore,
1843                                  * so we're done here only if the object is
1844                                  * still "phys_contiguous".
1845                                  * Otherwise, if the object is no longer
1846                                  * "phys_contiguous", we need to retry the
1847                                  * page fault against the object's new backing
1848                                  * store (different memory object).
1849                                  */
1850 phys_contig_object:
1851                                 goto done;
1852                         }
1853                         /*
1854                          * potentially a pagein fault
1855                          * if we make it through the state checks
1856                          * above, then we'll count it as such
1857                          */
1858                         my_fault = my_fault_type;
1859
1860                         /*
1861                          * Retry with same object/offset, since new data may
1862                          * be in a different page (i.e., m is meaningless at
1863                          * this point).
1864                          */
1865                         continue;
1866                 }
1867 dont_look_for_page:
1868                 /*
1869                  * We get here if the object has no pager, or an existence map
1870                  * exists and indicates the page isn't present on the pager
1871                  * or we're unwiring a page.  If a pager exists, but there
1872                  * is no existence map, then the m->vmp_absent case above handles
1873                  * the ZF case when the pager can't provide the page
1874                  */
1875 #if TRACEFAULTPAGE
1876                 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1877 #endif
1878                 if (object == first_object) {
1879                         first_m = m;
1880                 } else {
1881                         assert(m == VM_PAGE_NULL);
1882                 }
1883
1884                 next_object = object->shadow;
1885
1886                 if (next_object == VM_OBJECT_NULL) {
1887                         /*
1888                          * we've hit the bottom of the shadow chain,
1889                          * fill the page in the top object with zeros.
1890                          */
1891                         assert(!must_be_resident);
1892
1893                         if (object != first_object) {
1894                                 vm_object_paging_end(object);
1895                                 vm_object_unlock(object);
1896
1897                                 object = first_object;
1898                                 offset = first_offset;
1899                                 vm_object_lock(object);
1900                         }
1901                         m = first_m;
1902                         assert(VM_PAGE_OBJECT(m) == object);
1903                         first_m = VM_PAGE_NULL;
1904
1905                         /*
1906                          * check for any conditions that prevent
1907                          * us from creating a new zero-fill page
1908                          * vm_fault_check will do all of the
1909                          * fault cleanup in the case of an error condition
1910                          * including resetting the thread_interrupt_level
1911                          */
1912                         error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1913
1914                         if (error != VM_FAULT_SUCCESS) {
1915                                 return error;
1916                         }
1917
1918                         if (m == VM_PAGE_NULL) {
1919                                 m = vm_page_grab_options(grab_options);
1920
1921                                 if (m == VM_PAGE_NULL) {
1922                                         vm_fault_cleanup(object, VM_PAGE_NULL);
1923                                         thread_interrupt_level(interruptible_state);
1924
1925                                         return VM_FAULT_MEMORY_SHORTAGE;
1926                                 }
1927                                 vm_page_insert(m, object, vm_object_trunc_page(offset));
1928                         }
1929                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
1930                                 m->vmp_absent = TRUE;
1931                         }
1932
1933                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1934
1935                         break;
1936                 } else {
1937                         /*
1938                          * Move on to the next object.  Lock the next
1939                          * object before unlocking the current one.
1940                          */
1941                         if ((object != first_object) || must_be_resident) {
1942                                 vm_object_paging_end(object);
1943                         }
1944
1945                         offset += object->vo_shadow_offset;
1946                         fault_info->lo_offset += object->vo_shadow_offset;
1947                         fault_info->hi_offset += object->vo_shadow_offset;
1948                         access_required = VM_PROT_READ;
1949
1950                         vm_object_lock(next_object);
1951                         vm_object_unlock(object);
1952
1953                         object = next_object;
1954                         vm_object_paging_begin(object);
1955                 }
1956         }
1957
1958         /*
1959          *      PAGE HAS BEEN FOUND.
1960          *
1961          *      This page (m) is:
1962          *              busy, so that we can play with it;
1963          *              not absent, so that nobody else will fill it;
1964          *              possibly eligible for pageout;
1965          *
1966          *      The top-level page (first_m) is:
1967          *              VM_PAGE_NULL if the page was found in the
1968          *               top-level object;
1969          *              busy, not absent, and ineligible for pageout.
1970          *
1971          *      The current object (object) is locked.  A paging
1972          *      reference is held for the current and top-level
1973          *      objects.
1974          */
1975
1976 #if TRACEFAULTPAGE
1977         dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1978 #endif
1979 #if     EXTRA_ASSERTIONS
1980         assert(m->vmp_busy && !m->vmp_absent);
1981         assert((first_m == VM_PAGE_NULL) ||
1982             (first_m->vmp_busy && !first_m->vmp_absent &&
1983             !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded));
1984 #endif  /* EXTRA_ASSERTIONS */
1985
1986         /*
1987          * If the page is being written, but isn't
1988          * already owned by the top-level object,
1989          * we have to copy it into a new page owned
1990          * by the top-level object.
1991          */
1992         if (object != first_object) {
1993 #if TRACEFAULTPAGE
1994                 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1995 #endif
1996                 if (fault_type & VM_PROT_WRITE) {
1997                         vm_page_t copy_m;
1998
1999                         /*
2000                          * We only really need to copy if we
2001                          * want to write it.
2002                          */
2003                         assert(!must_be_resident);
2004
2005                         /*
2006                          * If we try to collapse first_object at this
2007                          * point, we may deadlock when we try to get
2008                          * the lock on an intermediate object (since we
2009                          * have the bottom object locked).  We can't
2010                          * unlock the bottom object, because the page
2011                          * we found may move (by collapse) if we do.
2012                          *
2013                          * Instead, we first copy the page.  Then, when
2014                          * we have no more use for the bottom object,
2015                          * we unlock it and try to collapse.
2016                          *
2017                          * Note that we copy the page even if we didn't
2018                          * need to... that's the breaks.
2019                          */
2020
2021                         /*
2022                          * Allocate a page for the copy
2023                          */
2024                         copy_m = vm_page_grab_options(grab_options);
2025
2026                         if (copy_m == VM_PAGE_NULL) {
2027                                 RELEASE_PAGE(m);
2028
2029                                 vm_fault_cleanup(object, first_m);
2030                                 thread_interrupt_level(interruptible_state);
2031
2032                                 return VM_FAULT_MEMORY_SHORTAGE;
2033                         }
2034
2035                         vm_page_copy(m, copy_m);
2036
2037                         /*
2038                          * If another map is truly sharing this
2039                          * page with us, we have to flush all
2040                          * uses of the original page, since we
2041                          * can't distinguish those which want the
2042                          * original from those which need the
2043                          * new copy.
2044                          *
2045                          * XXXO If we know that only one map has
2046                          * access to this page, then we could
2047                          * avoid the pmap_disconnect() call.
2048                          */
2049                         if (m->vmp_pmapped) {
2050                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2051                         }
2052
2053                         if (m->vmp_clustered) {
2054                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2055                                 VM_PAGE_CONSUME_CLUSTERED(m);
2056                         }
2057                         assert(!m->vmp_cleaning);
2058
2059                         /*
2060                          * We no longer need the old page or object.
2061                          */
2062                         RELEASE_PAGE(m);
2063
2064                         /*
2065                          * This check helps with marking the object as having a sequential pattern.
2066                          * Normally we'll miss doing this below because this fault is about COW to
2067                          * the first_object, i.e. bring page in from disk, push to object above, but
2068                          * don't update the file object's sequential pattern.
2069                          */
2070                         if (object->internal == FALSE) {
2071                                 vm_fault_is_sequential(object, offset, fault_info->behavior);
2072                         }
2073
2074                         vm_object_paging_end(object);
2075                         vm_object_unlock(object);
2076
2077                         my_fault = DBG_COW_FAULT;
2078                         VM_STAT_INCR(cow_faults);
2079                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2080                         current_task()->cow_faults++;
2081
2082                         object = first_object;
2083                         offset = first_offset;
2084
2085                         vm_object_lock(object);
2086                         /*
2087                          * get rid of the place holder
2088                          * page that we soldered in earlier
2089                          */
2090                         VM_PAGE_FREE(first_m);
2091                         first_m = VM_PAGE_NULL;
2092
2093                         /*
2094                          * and replace it with the
2095                          * page we just copied into
2096                          */
2097                         assert(copy_m->vmp_busy);
2098                         vm_page_insert(copy_m, object, vm_object_trunc_page(offset));
2099                         SET_PAGE_DIRTY(copy_m, TRUE);
2100
2101                         m = copy_m;
2102                         /*
2103                          * Now that we've gotten the copy out of the
2104                          * way, let's try to collapse the top object.
2105                          * But we have to play ugly games with
2106                          * paging_in_progress to do that...
2107                          */
2108                         vm_object_paging_end(object);
2109                         vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
2110                         vm_object_paging_begin(object);
2111                 } else {
2112                         *protection &= (~VM_PROT_WRITE);
2113                 }
2114         }
2115         /*
2116          * Now check whether the page needs to be pushed into the
2117          * copy object.  The use of asymmetric copy on write for
2118          * shared temporary objects means that we may do two copies to
2119          * satisfy the fault; one above to get the page from a
2120          * shadowed object, and one here to push it into the copy.
2121          */
2122         try_failed_count = 0;
2123
2124         while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
2125                 vm_object_offset_t      copy_offset;
2126                 vm_page_t               copy_m;
2127
2128 #if TRACEFAULTPAGE
2129                 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);    /* (TEST/DEBUG) */
2130 #endif
2131                 /*
2132                  * If the page is being written, but hasn't been
2133                  * copied to the copy-object, we have to copy it there.
2134                  */
2135                 if ((fault_type & VM_PROT_WRITE) == 0) {
2136                         *protection &= ~VM_PROT_WRITE;
2137                         break;
2138                 }
2139
2140                 /*
2141                  * If the page was guaranteed to be resident,
2142                  * we must have already performed the copy.
2143                  */
2144                 if (must_be_resident) {
2145                         break;
2146                 }
2147
2148                 /*
2149                  * Try to get the lock on the copy_object.
2150                  */
2151                 if (!vm_object_lock_try(copy_object)) {
2152                         vm_object_unlock(object);
2153                         try_failed_count++;
2154
2155                         mutex_pause(try_failed_count);  /* wait a bit */
2156                         vm_object_lock(object);
2157
2158                         continue;
2159                 }
2160                 try_failed_count = 0;
2161
2162                 /*
2163                  * Make another reference to the copy-object,
2164                  * to keep it from disappearing during the
2165                  * copy.
2166                  */
2167                 vm_object_reference_locked(copy_object);
2168
2169                 /*
2170                  * Does the page exist in the copy?
2171                  */
2172                 copy_offset = first_offset - copy_object->vo_shadow_offset;
2173                 copy_offset = vm_object_trunc_page(copy_offset);
2174
2175                 if (copy_object->vo_size <= copy_offset) {
2176                         /*
2177                          * Copy object doesn't cover this page -- do nothing.
2178                          */
2179                         ;
2180                 } else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2181                         /*
2182                          * Page currently exists in the copy object
2183                          */
2184                         if (copy_m->vmp_busy) {
2185                                 /*
2186                                  * If the page is being brought
2187                                  * in, wait for it and then retry.
2188                                  */
2189                                 RELEASE_PAGE(m);
2190
2191                                 /*
2192                                  * take an extra ref so object won't die
2193                                  */
2194                                 vm_object_reference_locked(copy_object);
2195                                 vm_object_unlock(copy_object);
2196                                 vm_fault_cleanup(object, first_m);
2197                                 counter(c_vm_fault_page_block_backoff_kernel++);
2198
2199                                 vm_object_lock(copy_object);
2200                                 assert(copy_object->ref_count > 0);
2201                                 VM_OBJ_RES_DECR(copy_object);
2202                                 vm_object_lock_assert_exclusive(copy_object);
2203                                 copy_object->ref_count--;
2204                                 assert(copy_object->ref_count > 0);
2205                                 copy_m = vm_page_lookup(copy_object, copy_offset);
2206
2207                                 if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) {
2208                                         PAGE_ASSERT_WAIT(copy_m, interruptible);
2209
2210                                         vm_object_unlock(copy_object);
2211                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
2212                                         vm_object_deallocate(copy_object);
2213
2214                                         goto backoff;
2215                                 } else {
2216                                         vm_object_unlock(copy_object);
2217                                         vm_object_deallocate(copy_object);
2218                                         thread_interrupt_level(interruptible_state);
2219
2220                                         return VM_FAULT_RETRY;
2221                                 }
2222                         }
2223                 } else if (!PAGED_OUT(copy_object, copy_offset)) {
2224                         /*
2225                          * If PAGED_OUT is TRUE, then the page used to exist
2226                          * in the copy-object, and has already been paged out.
2227                          * We don't need to repeat this. If PAGED_OUT is
2228                          * FALSE, then either we don't know (!pager_created,
2229                          * for example) or it hasn't been paged out.
2230                          * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2231                          * We must copy the page to the copy object.
2232                          *
2233                          * Allocate a page for the copy
2234                          */
2235                         copy_m = vm_page_alloc(copy_object, copy_offset);
2236
2237                         if (copy_m == VM_PAGE_NULL) {
2238                                 RELEASE_PAGE(m);
2239
2240                                 VM_OBJ_RES_DECR(copy_object);
2241                                 vm_object_lock_assert_exclusive(copy_object);
2242                                 copy_object->ref_count--;
2243                                 assert(copy_object->ref_count > 0);
2244
2245                                 vm_object_unlock(copy_object);
2246                                 vm_fault_cleanup(object, first_m);
2247                                 thread_interrupt_level(interruptible_state);
2248
2249                                 return VM_FAULT_MEMORY_SHORTAGE;
2250                         }
2251                         /*
2252                          * Must copy page into copy-object.
2253                          */
2254                         vm_page_copy(m, copy_m);
2255
2256                         /*
2257                          * If the old page was in use by any users
2258                          * of the copy-object, it must be removed
2259                          * from all pmaps.  (We can't know which
2260                          * pmaps use it.)
2261                          */
2262                         if (m->vmp_pmapped) {
2263                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2264                         }
2265
2266                         if (m->vmp_clustered) {
2267                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2268                                 VM_PAGE_CONSUME_CLUSTERED(m);
2269                         }
2270                         /*
2271                          * If there's a pager, then immediately
2272                          * page out this page, using the "initialize"
2273                          * option.  Else, we use the copy.
2274                          */
2275                         if ((!copy_object->pager_ready)
2276                             || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2277                             ) {
2278                                 vm_page_lockspin_queues();
2279                                 assert(!m->vmp_cleaning);
2280                                 vm_page_activate(copy_m);
2281                                 vm_page_unlock_queues();
2282
2283                                 SET_PAGE_DIRTY(copy_m, TRUE);
2284                                 PAGE_WAKEUP_DONE(copy_m);
2285                         } else {
2286                                 assert(copy_m->vmp_busy == TRUE);
2287                                 assert(!m->vmp_cleaning);
2288
2289                                 /*
2290                                  * dirty is protected by the object lock
2291                                  */
2292                                 SET_PAGE_DIRTY(copy_m, TRUE);
2293
2294                                 /*
2295                                  * The page is already ready for pageout:
2296                                  * not on pageout queues and busy.
2297                                  * Unlock everything except the
2298                                  * copy_object itself.
2299                                  */
2300                                 vm_object_unlock(object);
2301
2302                                 /*
2303                                  * Write the page to the copy-object,
2304                                  * flushing it from the kernel.
2305                                  */
2306                                 vm_pageout_initialize_page(copy_m);
2307
2308                                 /*
2309                                  * Since the pageout may have
2310                                  * temporarily dropped the
2311                                  * copy_object's lock, we
2312                                  * check whether we'll have
2313                                  * to deallocate the hard way.
2314                                  */
2315                                 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2316                                         vm_object_unlock(copy_object);
2317                                         vm_object_deallocate(copy_object);
2318                                         vm_object_lock(object);
2319
2320                                         continue;
2321                                 }
2322                                 /*
2323                                  * Pick back up the old object's
2324                                  * lock.  [It is safe to do so,
2325                                  * since it must be deeper in the
2326                                  * object tree.]
2327                                  */
2328                                 vm_object_lock(object);
2329                         }
2330
2331                         /*
2332                          * Because we're pushing a page upward
2333                          * in the object tree, we must restart
2334                          * any faults that are waiting here.
2335                          * [Note that this is an expansion of
2336                          * PAGE_WAKEUP that uses the THREAD_RESTART
2337                          * wait result].  Can't turn off the page's
2338                          * busy bit because we're not done with it.
2339                          */
2340                         if (m->vmp_wanted) {
2341                                 m->vmp_wanted = FALSE;
2342                                 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2343                         }
2344                 }
2345                 /*
2346                  * The reference count on copy_object must be
2347                  * at least 2: one for our extra reference,
2348                  * and at least one from the outside world
2349                  * (we checked that when we last locked
2350                  * copy_object).
2351                  */
2352                 vm_object_lock_assert_exclusive(copy_object);
2353                 copy_object->ref_count--;
2354                 assert(copy_object->ref_count > 0);
2355
2356                 VM_OBJ_RES_DECR(copy_object);
2357                 vm_object_unlock(copy_object);
2358
2359                 break;
2360         }
2361
2362 done:
2363         *result_page = m;
2364         *top_page = first_m;
2365
2366         if (m != VM_PAGE_NULL) {
2367                 assert(VM_PAGE_OBJECT(m) == object);
2368
2369                 retval = VM_FAULT_SUCCESS;
2370
2371                 if (my_fault == DBG_PAGEIN_FAULT) {
2372                         VM_PAGE_COUNT_AS_PAGEIN(m);
2373
2374                         if (object->internal) {
2375                                 my_fault = DBG_PAGEIND_FAULT;
2376                         } else {
2377                                 my_fault = DBG_PAGEINV_FAULT;
2378                         }
2379
2380                         /*
2381                          * evaluate access pattern and update state
2382                          * vm_fault_deactivate_behind depends on the
2383                          * state being up to date
2384                          */
2385                         vm_fault_is_sequential(object, offset, fault_info->behavior);
2386                         vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2387                 } else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) {
2388                         /*
2389                          * we weren't called from vm_fault, so handle the
2390                          * accounting here for hits in the cache
2391                          */
2392                         if (m->vmp_clustered) {
2393                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2394                                 VM_PAGE_CONSUME_CLUSTERED(m);
2395                         }
2396                         vm_fault_is_sequential(object, offset, fault_info->behavior);
2397                         vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2398                 } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2399                         VM_STAT_DECOMPRESSIONS();
2400                 }
2401                 if (type_of_fault) {
2402                         *type_of_fault = my_fault;
2403                 }
2404         } else {
2405                 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2406                 assert(first_m == VM_PAGE_NULL);
2407                 assert(object == first_object);
2408         }
2409
2410         thread_interrupt_level(interruptible_state);
2411
2412 #if TRACEFAULTPAGE
2413         dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);       /* (TEST/DEBUG) */
2414 #endif
2415         return retval;
2416
2417 backoff:
2418         thread_interrupt_level(interruptible_state);
2419
2420         if (wait_result == THREAD_INTERRUPTED) {
2421                 return VM_FAULT_INTERRUPTED;
2422         }
2423         return VM_FAULT_RETRY;
2424
2425 #undef  RELEASE_PAGE
2426 }
2427
2428
/*
 * Code-signing support: routines and a tunable that are defined outside
 * this file.
 */
extern int proc_selfpid(void);
extern char *proc_name_address(void *p);
extern int panic_on_cs_killed;

/*
 * Counters defined here, both starting at zero.
 * NOTE(review): judging by their names these count tainted pages whose
 * entry was rejected / accepted; the code that updates them is not in
 * view here -- confirm against the code-signing violation handler.
 */
unsigned long cs_enter_tainted_rejected = 0;
unsigned long cs_enter_tainted_accepted = 0;
2434
2435 /*
2436  * CODE SIGNING:
2437  * When soft faulting a page, we have to validate the page if:
2438  * 1. the page is being mapped in user space
2439  * 2. the page hasn't already been found to be "tainted"
2440  * 3. the page belongs to a code-signed object
2441  * 4. the page has not been validated yet or has been mapped for write.
2442  */
2443 static bool
2444 vm_fault_cs_need_validation(
2445         pmap_t pmap,
2446         vm_page_t page,
2447         vm_object_t page_obj,
2448         vm_map_size_t fault_page_size,
2449         vm_map_offset_t fault_phys_offset)
2450 {
2451         if (pmap == kernel_pmap) {
2452                 /* 1 - not user space */
2453                 return false;
2454         }
2455         if (!page_obj->code_signed) {
2456                 /* 3 - page does not belong to a code-signed object */
2457                 return false;
2458         }
2459         if (fault_page_size == PAGE_SIZE) {
2460                 /* looking at the whole page */
2461                 assertf(fault_phys_offset == 0,
2462                     "fault_page_size 0x%llx fault_phys_offset 0x%llx\n",
2463                     (uint64_t)fault_page_size,
2464                     (uint64_t)fault_phys_offset);
2465                 if (page->vmp_cs_tainted == VMP_CS_ALL_TRUE) {
2466                         /* 2 - page is all tainted */
2467                         return false;
2468                 }
2469                 if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
2470                     !page->vmp_wpmapped) {
2471                         /* 4 - already fully validated and never mapped writable */
2472                         return false;
2473                 }
2474         } else {
2475                 /* looking at a specific sub-page */
2476                 if (VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
2477                         /* 2 - sub-page was already marked as tainted */
2478                         return false;
2479                 }
2480                 if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) &&
2481                     !page->vmp_wpmapped) {
2482                         /* 4 - already validated and never mapped writable */
2483                         return false;
2484                 }
2485         }
2486         /* page needs to be validated */
2487         return true;
2488 }
2489
2490
2491 static bool
2492 vm_fault_cs_page_immutable(
2493         vm_page_t m,
2494         vm_map_size_t fault_page_size,
2495         vm_map_offset_t fault_phys_offset,
2496         vm_prot_t prot __unused)
2497 {
2498         if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)
2499             /*&& ((prot) & VM_PROT_EXECUTE)*/) {
2500                 return true;
2501         }
2502         return false;
2503 }
2504
2505 static bool
2506 vm_fault_cs_page_nx(
2507         vm_page_t m,
2508         vm_map_size_t fault_page_size,
2509         vm_map_offset_t fault_phys_offset)
2510 {
2511         return VMP_CS_NX(m, fault_page_size, fault_phys_offset);
2512 }
2513
2514 /*
2515  * Check if the page being entered into the pmap violates code signing.
2516  */
2517 static kern_return_t
2518 vm_fault_cs_check_violation(
2519         bool cs_bypass,
2520         vm_object_t object,
2521         vm_page_t m,
2522         pmap_t pmap,
2523         vm_prot_t prot,
2524         vm_prot_t caller_prot,
2525         vm_map_size_t fault_page_size,
2526         vm_map_offset_t fault_phys_offset,
2527         vm_object_fault_info_t fault_info,
2528         bool map_is_switched,
2529         bool map_is_switch_protected,
2530         bool *cs_violation)
2531 {
2532 #if !PMAP_CS
2533 #pragma unused(caller_prot)
2534 #pragma unused(fault_info)
2535 #endif /* !PMAP_CS */
2536         int             cs_enforcement_enabled;
2537         if (!cs_bypass &&
2538             vm_fault_cs_need_validation(pmap, m, object,
2539             fault_page_size, fault_phys_offset)) {
2540                 vm_object_lock_assert_exclusive(object);
2541
2542                 if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)) {
2543                         vm_cs_revalidates++;
2544                 }
2545
2546                 /* VM map is locked, so 1 ref will remain on VM object -
2547                  * so no harm if vm_page_validate_cs drops the object lock */
2548
2549                 vm_page_validate_cs(m, fault_page_size, fault_phys_offset);
2550         }
2551
2552         /* If the map is switched, and is switch-protected, we must protect
2553          * some pages from being write-faulted: immutable pages because by
2554          * definition they may not be written, and executable pages because that
2555          * would provide a way to inject unsigned code.
2556          * If the page is immutable, we can simply return. However, we can't
2557          * immediately determine whether a page is executable anywhere. But,
2558          * we can disconnect it everywhere and remove the executable protection
2559          * from the current map. We do that below right before we do the
2560          * PMAP_ENTER.
2561          */
2562         if (pmap == kernel_pmap) {
2563                 /* kernel fault: cs_enforcement does not apply */
2564                 cs_enforcement_enabled = 0;
2565         } else {
2566                 cs_enforcement_enabled = pmap_get_vm_map_cs_enforced(pmap);
2567         }
2568
2569         if (cs_enforcement_enabled && map_is_switched &&
2570             map_is_switch_protected &&
2571             vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
2572             (prot & VM_PROT_WRITE)) {
2573                 return KERN_CODESIGN_ERROR;
2574         }
2575
2576         if (cs_enforcement_enabled &&
2577             vm_fault_cs_page_nx(m, fault_page_size, fault_phys_offset) &&
2578             (prot & VM_PROT_EXECUTE)) {
2579                 if (cs_debug) {
2580                         printf("page marked to be NX, not letting it be mapped EXEC\n");
2581                 }
2582                 return KERN_CODESIGN_ERROR;
2583         }
2584
2585         /* A page could be tainted, or pose a risk of being tainted later.
2586          * Check whether the receiving process wants it, and make it feel
2587          * the consequences (that hapens in cs_invalid_page()).
2588          * For CS Enforcement, two other conditions will
2589          * cause that page to be tainted as well:
2590          * - pmapping an unsigned page executable - this means unsigned code;
2591          * - writeable mapping of a validated page - the content of that page
2592          *   can be changed without the kernel noticing, therefore unsigned
2593          *   code can be created
2594          */
2595         if (cs_bypass) {
2596                 /* code-signing is bypassed */
2597                 *cs_violation = FALSE;
2598         } else if (VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
2599                 /* tainted page */
2600                 *cs_violation = TRUE;
2601         } else if (!cs_enforcement_enabled) {
2602                 /* no further code-signing enforcement */
2603                 *cs_violation = FALSE;
2604         } else if (vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
2605             ((prot & VM_PROT_WRITE) ||
2606             m->vmp_wpmapped)) {
2607                 /*
2608                  * The page should be immutable, but is in danger of being
2609                  * modified.
2610                  * This is the case where we want policy from the code
2611                  * directory - is the page immutable or not? For now we have
2612                  * to assume that code pages will be immutable, data pages not.
2613                  * We'll assume a page is a code page if it has a code directory
2614                  * and we fault for execution.
2615                  * That is good enough since if we faulted the code page for
2616                  * writing in another map before, it is wpmapped; if we fault
2617                  * it for writing in this map later it will also be faulted for
2618                  * executing at the same time; and if we fault for writing in
2619                  * another map later, we will disconnect it from this pmap so
2620                  * we'll notice the change.
2621                  */
2622                 *cs_violation = TRUE;
2623         } else if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
2624             (prot & VM_PROT_EXECUTE)
2625             ) {
2626                 *cs_violation = TRUE;
2627         } else {
2628                 *cs_violation = FALSE;
2629         }
2630         return KERN_SUCCESS;
2631 }
2632
2633 /*
2634  * Handles a code signing violation by either rejecting the page or forcing a disconnect.
2635  * @param must_disconnect This value will be set to true if the caller must disconnect
2636  * this page.
2637  * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
2638  */
2639 static kern_return_t
2640 vm_fault_cs_handle_violation(
2641         vm_object_t object,
2642         vm_page_t m,
2643         pmap_t pmap,
2644         vm_prot_t prot,
2645         vm_map_offset_t vaddr,
2646         vm_map_size_t fault_page_size,
2647         vm_map_offset_t fault_phys_offset,
2648         bool map_is_switched,
2649         bool map_is_switch_protected,
2650         bool *must_disconnect)
2651 {
2652 #if !MACH_ASSERT
2653 #pragma unused(pmap)
2654 #pragma unused(map_is_switch_protected)
2655 #endif /* !MACH_ASSERT */
2656         /*
2657          * We will have a tainted page. Have to handle the special case
2658          * of a switched map now. If the map is not switched, standard
2659          * procedure applies - call cs_invalid_page().
2660          * If the map is switched, the real owner is invalid already.
2661          * There is no point in invalidating the switching process since
2662          * it will not be executing from the map. So we don't call
2663          * cs_invalid_page() in that case.
2664          */
2665         boolean_t reject_page, cs_killed;
2666         kern_return_t kr;
2667         if (map_is_switched) {
2668                 assert(pmap == vm_map_pmap(current_thread()->map));
2669                 assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2670                 reject_page = FALSE;
2671         } else {
2672                 if (cs_debug > 5) {
2673                         printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
2674                             object->code_signed ? "yes" : "no",
2675                             VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
2676                             VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
2677                             m->vmp_wpmapped ? "yes" : "no",
2678                             (int)prot);
2679                 }
2680                 reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed);
2681         }
2682
2683         if (reject_page) {
2684                 /* reject the invalid page: abort the page fault */
2685                 int                     pid;
2686                 const char              *procname;
2687                 task_t                  task;
2688                 vm_object_t             file_object, shadow;
2689                 vm_object_offset_t      file_offset;
2690                 char                    *pathname, *filename;
2691                 vm_size_t               pathname_len, filename_len;
2692                 boolean_t               truncated_path;
2693 #define __PATH_MAX 1024
2694                 struct timespec         mtime, cs_mtime;
2695                 int                     shadow_depth;
2696                 os_reason_t             codesigning_exit_reason = OS_REASON_NULL;
2697
2698                 kr = KERN_CODESIGN_ERROR;
2699                 cs_enter_tainted_rejected++;
2700
2701                 /* get process name and pid */
2702                 procname = "?";
2703                 task = current_task();
2704                 pid = proc_selfpid();
2705                 if (task->bsd_info != NULL) {
2706                         procname = proc_name_address(task->bsd_info);
2707                 }
2708
2709                 /* get file's VM object */
2710                 file_object = object;
2711                 file_offset = m->vmp_offset;
2712                 for (shadow = file_object->shadow,
2713                     shadow_depth = 0;
2714                     shadow != VM_OBJECT_NULL;
2715                     shadow = file_object->shadow,
2716                     shadow_depth++) {
2717                         vm_object_lock_shared(shadow);
2718                         if (file_object != object) {
2719                                 vm_object_unlock(file_object);
2720                         }
2721                         file_offset += file_object->vo_shadow_offset;
2722                         file_object = shadow;
2723                 }
2724
2725                 mtime.tv_sec = 0;
2726                 mtime.tv_nsec = 0;
2727                 cs_mtime.tv_sec = 0;
2728                 cs_mtime.tv_nsec = 0;
2729
2730                 /* get file's pathname and/or filename */
2731                 pathname = NULL;
2732                 filename = NULL;
2733                 pathname_len = 0;
2734                 filename_len = 0;
2735                 truncated_path = FALSE;
2736                 /* no pager -> no file -> no pathname, use "<nil>" in that case */
2737                 if (file_object->pager != NULL) {
2738                         pathname = kheap_alloc(KHEAP_TEMP, __PATH_MAX * 2, Z_WAITOK);
2739                         if (pathname) {
2740                                 pathname[0] = '\0';
2741                                 pathname_len = __PATH_MAX;
2742                                 filename = pathname + pathname_len;
2743                                 filename_len = __PATH_MAX;
2744
2745                                 if (vnode_pager_get_object_name(file_object->pager,
2746                                     pathname,
2747                                     pathname_len,
2748                                     filename,
2749                                     filename_len,
2750                                     &truncated_path) == KERN_SUCCESS) {
2751                                         /* safety first... */
2752                                         pathname[__PATH_MAX - 1] = '\0';
2753                                         filename[__PATH_MAX - 1] = '\0';
2754
2755                                         vnode_pager_get_object_mtime(file_object->pager,
2756                                             &mtime,
2757                                             &cs_mtime);
2758                                 } else {
2759                                         kheap_free(KHEAP_TEMP, pathname, __PATH_MAX * 2);
2760                                         pathname = NULL;
2761                                         filename = NULL;
2762                                         pathname_len = 0;
2763                                         filename_len = 0;
2764                                         truncated_path = FALSE;
2765                                 }
2766                         }
2767                 }
2768                 printf("CODE SIGNING: process %d[%s]: "
2769                     "rejecting invalid page at address 0x%llx "
2770                     "from offset 0x%llx in file \"%s%s%s\" "
2771                     "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2772                     "(signed:%d validated:%d tainted:%d nx:%d "
2773                     "wpmapped:%d dirty:%d depth:%d)\n",
2774                     pid, procname, (addr64_t) vaddr,
2775                     file_offset,
2776                     (pathname ? pathname : "<nil>"),
2777                     (truncated_path ? "/.../" : ""),
2778                     (truncated_path ? filename : ""),
2779                     cs_mtime.tv_sec, cs_mtime.tv_nsec,
2780                     ((cs_mtime.tv_sec == mtime.tv_sec &&
2781                     cs_mtime.tv_nsec == mtime.tv_nsec)
2782                     ? "=="
2783                     : "!="),
2784                     mtime.tv_sec, mtime.tv_nsec,
2785                     object->code_signed,
2786                     VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
2787                     VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
2788                     VMP_CS_NX(m, fault_page_size, fault_phys_offset),
2789                     m->vmp_wpmapped,
2790                     m->vmp_dirty,
2791                     shadow_depth);
2792
2793                 /*
2794                  * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
2795                  * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
2796                  * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
2797                  * will deal with the segmentation fault.
2798                  */
2799                 if (cs_killed) {
2800                         KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
2801                             pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE, 0, 0);
2802
2803                         codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
2804                         if (codesigning_exit_reason == NULL) {
2805                                 printf("vm_fault_enter: failed to allocate codesigning exit reason\n");
2806                         } else {
2807                                 mach_vm_address_t data_addr = 0;
2808                                 struct codesigning_exit_reason_info *ceri = NULL;
2809                                 uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri));
2810
2811                                 if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) {
2812                                         printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
2813                                 } else {
2814                                         if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor,
2815                                             EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) {
2816                                                 ceri = (struct codesigning_exit_reason_info *)data_addr;
2817                                                 static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname));
2818
2819                                                 ceri->ceri_virt_addr = vaddr;
2820                                                 ceri->ceri_file_offset = file_offset;
2821                                                 if (pathname) {
2822                                                         strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname));
2823                                                 } else {
2824                                                         ceri->ceri_pathname[0] = '\0';
2825                                                 }
2826                                                 if (filename) {
2827                                                         strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename));
2828                                                 } else {
2829                                                         ceri->ceri_filename[0] = '\0';
2830                                                 }
2831                                                 ceri->ceri_path_truncated = (truncated_path ? 1 : 0);
2832                                                 ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec;
2833                                                 ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec;
2834                                                 ceri->ceri_page_modtime_secs = mtime.tv_sec;
2835                                                 ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
2836                                                 ceri->ceri_object_codesigned = (object->code_signed);
2837                                                 ceri->ceri_page_codesig_validated = VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset);
2838                                                 ceri->ceri_page_codesig_tainted = VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset);
2839                                                 ceri->ceri_page_codesig_nx = VMP_CS_NX(m, fault_page_size, fault_phys_offset);
2840                                                 ceri->ceri_page_wpmapped = (m->vmp_wpmapped);
2841                                                 ceri->ceri_page_slid = 0;
2842                                                 ceri->ceri_page_dirty = (m->vmp_dirty);
2843                                                 ceri->ceri_page_shadow_depth = shadow_depth;
2844                                         } else {
2845 #if DEBUG || DEVELOPMENT
2846                                                 panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
2847 #else
2848                                                 printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
2849 #endif /* DEBUG || DEVELOPMENT */
2850                                                 /* Free the buffer */
2851                                                 os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0);
2852                                         }
2853                                 }
2854                         }
2855
2856                         set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE);
2857                 }
2858                 if (panic_on_cs_killed &&
2859                     object->object_is_shared_cache) {
2860                         char *tainted_contents;
2861                         vm_map_offset_t src_vaddr;
2862                         src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m) << PAGE_SHIFT);
2863                         tainted_contents = kalloc(PAGE_SIZE);
2864                         bcopy((const char *)src_vaddr, tainted_contents, PAGE_SIZE);
2865                         printf("CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m, VM_PAGE_GET_PHYS_PAGE(m), (uint64_t)src_vaddr, tainted_contents);
2866                         panic("CODE SIGNING: process %d[%s]: "
2867                             "rejecting invalid page (phys#0x%x) at address 0x%llx "
2868                             "from offset 0x%llx in file \"%s%s%s\" "
2869                             "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2870                             "(signed:%d validated:%d tainted:%d nx:%d"
2871                             "wpmapped:%d dirty:%d depth:%d)\n",
2872                             pid, procname,
2873                             VM_PAGE_GET_PHYS_PAGE(m),
2874                             (addr64_t) vaddr,
2875                             file_offset,
2876                             (pathname ? pathname : "<nil>"),
2877                             (truncated_path ? "/.../" : ""),
2878                             (truncated_path ? filename : ""),
2879                             cs_mtime.tv_sec, cs_mtime.tv_nsec,
2880                             ((cs_mtime.tv_sec == mtime.tv_sec &&
2881                             cs_mtime.tv_nsec == mtime.tv_nsec)
2882                             ? "=="
2883                             : "!="),
2884                             mtime.tv_sec, mtime.tv_nsec,
2885                             object->code_signed,
2886                             VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
2887                             VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
2888                             VMP_CS_NX(m, fault_page_size, fault_phys_offset),
2889                             m->vmp_wpmapped,
2890                             m->vmp_dirty,
2891                             shadow_depth);
2892                 }
2893
2894                 if (file_object != object) {
2895                         vm_object_unlock(file_object);
2896                 }
2897                 if (pathname_len != 0) {
2898                         kheap_free(KHEAP_TEMP, pathname, __PATH_MAX * 2);
2899                         pathname = NULL;
2900                         filename = NULL;
2901                 }
2902         } else {
2903                 /* proceed with the invalid page */
2904                 kr = KERN_SUCCESS;
2905                 if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
2906                     !object->code_signed) {
2907                         /*
2908                          * This page has not been (fully) validated but
2909                          * does not belong to a code-signed object
2910                          * so it should not be forcefully considered
2911                          * as tainted.
2912                          * We're just concerned about it here because
2913                          * we've been asked to "execute" it but that
2914                          * does not mean that it should cause other
2915                          * accesses to fail.
2916                          * This happens when a debugger sets a
2917                          * breakpoint and we then execute code in
2918                          * that page.  Marking the page as "tainted"
2919                          * would cause any inspection tool ("leaks",
2920                          * "vmmap", "CrashReporter", ...) to get killed
2921                          * due to code-signing violation on that page,
2922                          * even though they're just reading it and not
2923                          * executing from it.
2924                          */
2925                 } else {
2926                         /*
2927                          * Page might have been tainted before or not;
2928                          * now it definitively is. If the page wasn't
2929                          * tainted, we must disconnect it from all
2930                          * pmaps later, to force existing mappings
2931                          * through that code path for re-consideration
2932                          * of the validity of that page.
2933                          */
2934                         if (!VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
2935                                 *must_disconnect = TRUE;
2936                                 VMP_CS_SET_TAINTED(m, fault_page_size, fault_phys_offset, TRUE);
2937                         }
2938                 }
2939                 cs_enter_tainted_accepted++;
2940         }
2941         if (kr != KERN_SUCCESS) {
2942                 if (cs_debug) {
2943                         printf("CODESIGNING: vm_fault_enter(0x%llx): "
2944                             "*** INVALID PAGE ***\n",
2945                             (long long)vaddr);
2946                 }
2947 #if !SECURE_KERNEL
2948                 if (cs_enforcement_panic) {
2949                         panic("CODESIGNING: panicking on invalid page\n");
2950                 }
2951 #endif
2952         }
2953         return kr;
2954 }
2955
2956 /*
2957  * Check that the code signature is valid for the given page being inserted into
2958  * the pmap.
2959  *
2960  * @param must_disconnect This value will be set to true if the caller must disconnect
2961  * this page.
2962  * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
2963  */
2964 static kern_return_t
2965 vm_fault_validate_cs(
2966         bool cs_bypass,
2967         vm_object_t object,
2968         vm_page_t m,
2969         pmap_t pmap,
2970         vm_map_offset_t vaddr,
2971         vm_prot_t prot,
2972         vm_prot_t caller_prot,
2973         vm_map_size_t fault_page_size,
2974         vm_map_offset_t fault_phys_offset,
2975         vm_object_fault_info_t fault_info,
2976         bool *must_disconnect)
2977 {
2978         bool map_is_switched, map_is_switch_protected, cs_violation;
2979         kern_return_t kr;
2980         /* Validate code signature if necessary. */
2981         map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2982             (pmap == vm_map_pmap(current_thread()->map)));
2983         map_is_switch_protected = current_thread()->map->switch_protect;
2984         kr = vm_fault_cs_check_violation(cs_bypass, object, m, pmap,
2985             prot, caller_prot, fault_page_size, fault_phys_offset, fault_info,
2986             map_is_switched, map_is_switch_protected, &cs_violation);
2987         if (kr != KERN_SUCCESS) {
2988                 return kr;
2989         }
2990         if (cs_violation) {
2991                 kr = vm_fault_cs_handle_violation(object, m, pmap, prot, vaddr,
2992                     fault_page_size, fault_phys_offset,
2993                     map_is_switched, map_is_switch_protected, must_disconnect);
2994         }
2995         return kr;
2996 }
2997
2998 /*
2999  * Enqueue the page on the appropriate paging queue.
3000  */
3001 static void
3002 vm_fault_enqueue_page(
3003         vm_object_t object,
3004         vm_page_t m,
3005         bool wired,
3006         bool change_wiring,
3007         vm_tag_t wire_tag,
3008         bool no_cache,
3009         int *type_of_fault,
3010         kern_return_t kr)
3011 {
3012         assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object);
3013         boolean_t       page_queues_locked = FALSE;
3014         boolean_t       previously_pmapped = m->vmp_pmapped;
3015 #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED()   \
3016 MACRO_BEGIN                                     \
3017         if (! page_queues_locked) {             \
3018                 page_queues_locked = TRUE;      \
3019                 vm_page_lockspin_queues();      \
3020         }                                       \
3021 MACRO_END
3022 #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED()     \
3023 MACRO_BEGIN                                     \
3024         if (page_queues_locked) {               \
3025                 page_queues_locked = FALSE;     \
3026                 vm_page_unlock_queues();        \
3027         }                                       \
3028 MACRO_END
3029
3030 #if CONFIG_BACKGROUND_QUEUE
3031         vm_page_update_background_state(m);
3032 #endif
3033         if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
3034                 /*
3035                  * Compressor pages are neither wired
3036                  * nor pageable and should never change.
3037                  */
3038                 assert(object == compressor_object);
3039         } else if (change_wiring) {
3040                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3041
3042                 if (wired) {
3043                         if (kr == KERN_SUCCESS) {
3044                                 vm_page_wire(m, wire_tag, TRUE);
3045                         }
3046                 } else {
3047                         vm_page_unwire(m, TRUE);
3048                 }
3049                 /* we keep the page queues lock, if we need it later */
3050         } else {
3051                 if (object->internal == TRUE) {
3052                         /*
3053                          * don't allow anonymous pages on
3054                          * the speculative queues
3055                          */
3056                         no_cache = FALSE;
3057                 }
3058                 if (kr != KERN_SUCCESS) {
3059                         __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3060                         vm_page_deactivate(m);
3061                         /* we keep the page queues lock, if we need it later */
3062                 } else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
3063                     (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
3064                     (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
3065                     ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
3066                     !VM_PAGE_WIRED(m)) {
3067                         if (vm_page_local_q &&
3068                             (*type_of_fault == DBG_COW_FAULT ||
3069                             *type_of_fault == DBG_ZERO_FILL_FAULT)) {
3070                                 struct vpl      *lq;
3071                                 uint32_t        lid;
3072
3073                                 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3074
3075                                 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3076                                 vm_object_lock_assert_exclusive(object);
3077
3078                                 /*
3079                                  * we got a local queue to stuff this
3080                                  * new page on...
3081                                  * its safe to manipulate local and
3082                                  * local_id at this point since we're
3083                                  * behind an exclusive object lock and
3084                                  * the page is not on any global queue.
3085                                  *
3086                                  * we'll use the current cpu number to
3087                                  * select the queue note that we don't
3088                                  * need to disable preemption... we're
3089                                  * going to be behind the local queue's
3090                                  * lock to do the real work
3091                                  */
3092                                 lid = cpu_number();
3093
3094                                 lq = zpercpu_get_cpu(vm_page_local_q, lid);
3095
3096                                 VPL_LOCK(&lq->vpl_lock);
3097
3098                                 vm_page_check_pageable_safe(m);
3099                                 vm_page_queue_enter(&lq->vpl_queue, m, vmp_pageq);
3100                                 m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
3101                                 m->vmp_local_id = lid;
3102                                 lq->vpl_count++;
3103
3104                                 if (object->internal) {
3105                                         lq->vpl_internal_count++;
3106                                 } else {
3107                                         lq->vpl_external_count++;
3108                                 }
3109
3110                                 VPL_UNLOCK(&lq->vpl_lock);
3111
3112                                 if (lq->vpl_count > vm_page_local_q_soft_limit) {
3113                                         /*
3114                                          * we're beyond the soft limit
3115                                          * for the local queue
3116                                          * vm_page_reactivate_local will
3117                                          * 'try' to take the global page
3118                                          * queue lock... if it can't
3119                                          * that's ok... we'll let the
3120                                          * queue continue to grow up
3121                                          * to the hard limit... at that
3122                                          * point we'll wait for the
3123                                          * lock... once we've got the
3124                                          * lock, we'll transfer all of
3125                                          * the pages from the local
3126                                          * queue to the global active
3127                                          * queue
3128                                          */
3129                                         vm_page_reactivate_local(lid, FALSE, FALSE);
3130                                 }
3131                         } else {
3132                                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3133
3134                                 /*
3135                                  * test again now that we hold the
3136                                  * page queue lock
3137                                  */
3138                                 if (!VM_PAGE_WIRED(m)) {
3139                                         if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3140                                                 vm_page_queues_remove(m, FALSE);
3141
3142                                                 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3143                                                 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1);
3144                                         }
3145
3146                                         if (!VM_PAGE_ACTIVE_OR_INACTIVE(m) ||
3147                                             no_cache) {
3148                                                 /*
3149                                                  * If this is a no_cache mapping
3150                                                  * and the page has never been
3151                                                  * mapped before or was
3152                                                  * previously a no_cache page,
3153                                                  * then we want to leave pages
3154                                                  * in the speculative state so
3155                                                  * that they can be readily
3156                                                  * recycled if free memory runs
3157                                                  * low.  Otherwise the page is
3158                                                  * activated as normal.
3159                                                  */
3160
3161                                                 if (no_cache &&
3162                                                     (!previously_pmapped ||
3163                                                     m->vmp_no_cache)) {
3164                                                         m->vmp_no_cache = TRUE;
3165
3166                                                         if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
3167                                                                 vm_page_speculate(m, FALSE);
3168                                                         }
3169                                                 } else if (!VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
3170                                                         vm_page_activate(m);
3171                                                 }
3172                                         }
3173                                 }
3174                                 /* we keep the page queues lock, if we need it later */
3175                         }
3176                 }
3177         }
3178         /* we're done with the page queues lock, if we ever took it */
3179         __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3180 }
3181
3182 /*
3183  * Sets the pmmpped, xpmapped, and wpmapped bits on the vm_page_t and updates accounting.
3184  * @return true if the page needs to be sync'ed via pmap_sync-page_data_physo
3185  * before being inserted into the pmap.
3186  */
3187 static bool
3188 vm_fault_enter_set_mapped(
3189         vm_object_t object,
3190         vm_page_t m,
3191         vm_prot_t prot,
3192         vm_prot_t fault_type)
3193 {
3194         bool page_needs_sync = false;
3195         /*
3196          * NOTE: we may only hold the vm_object lock SHARED
3197          * at this point, so we need the phys_page lock to
3198          * properly serialize updating the pmapped and
3199          * xpmapped bits
3200          */
3201         if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) {
3202                 ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3203
3204                 pmap_lock_phys_page(phys_page);
3205                 m->vmp_pmapped = TRUE;
3206
3207                 if (!m->vmp_xpmapped) {
3208                         m->vmp_xpmapped = TRUE;
3209
3210                         pmap_unlock_phys_page(phys_page);
3211
3212                         if (!object->internal) {
3213                                 OSAddAtomic(1, &vm_page_xpmapped_external_count);
3214                         }
3215
3216 #if defined(__arm__) || defined(__arm64__)
3217                         page_needs_sync = true;
3218 #else
3219                         if (object->internal &&
3220                             object->pager != NULL) {
3221                                 /*
3222                                  * This page could have been
3223                                  * uncompressed by the
3224                                  * compressor pager and its
3225                                  * contents might be only in
3226                                  * the data cache.
3227                                  * Since it's being mapped for
3228                                  * "execute" for the fist time,
3229                                  * make sure the icache is in
3230                                  * sync.
3231                                  */
3232                                 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3233                                 page_needs_sync = true;
3234                         }
3235 #endif
3236                 } else {
3237                         pmap_unlock_phys_page(phys_page);
3238                 }
3239         } else {
3240                 if (m->vmp_pmapped == FALSE) {
3241                         ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3242
3243                         pmap_lock_phys_page(phys_page);
3244                         m->vmp_pmapped = TRUE;
3245                         pmap_unlock_phys_page(phys_page);
3246                 }
3247         }
3248
3249         if (fault_type & VM_PROT_WRITE) {
3250                 if (m->vmp_wpmapped == FALSE) {
3251                         vm_object_lock_assert_exclusive(object);
3252                         if (!object->internal && object->pager) {
3253                                 task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager));
3254                         }
3255                         m->vmp_wpmapped = TRUE;
3256                 }
3257         }
3258         return page_needs_sync;
3259 }
3260
3261 /*
3262  * Try to enter the given page into the pmap.
3263  * Will retry without execute permission iff PMAP_CS is enabled and we encounter
3264  * a codesigning failure on a non-execute fault.
3265  */
3266 static kern_return_t
3267 vm_fault_attempt_pmap_enter(
3268         pmap_t pmap,
3269         vm_map_offset_t vaddr,
3270         vm_map_size_t fault_page_size,
3271         vm_map_offset_t fault_phys_offset,
3272         vm_page_t m,
3273         vm_prot_t *prot,
3274         vm_prot_t caller_prot,
3275         vm_prot_t fault_type,
3276         bool wired,
3277         int pmap_options)
3278 {
3279 #if !PMAP_CS
3280 #pragma unused(caller_prot)
3281 #endif /* !PMAP_CS */
3282         kern_return_t kr;
3283         if (fault_page_size != PAGE_SIZE) {
3284                 DEBUG4K_FAULT("pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x fault_type 0x%x\n", pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, *prot, fault_type);
3285                 assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
3286                     fault_phys_offset < PAGE_SIZE),
3287                     "0x%llx\n", (uint64_t)fault_phys_offset);
3288         } else {
3289                 assertf(fault_phys_offset == 0,
3290                     "0x%llx\n", (uint64_t)fault_phys_offset);
3291         }
3292
3293         PMAP_ENTER_OPTIONS(pmap, vaddr,
3294             fault_phys_offset,
3295             m, *prot, fault_type, 0,
3296             wired,
3297             pmap_options,
3298             kr);
3299         return kr;
3300 }
3301
3302 /*
3303  * Enter the given page into the pmap.
3304  * The map must be locked shared.
3305  * The vm object must NOT be locked.
3306  *
3307  * @param need_retry if not null, avoid making a (potentially) blocking call into
3308  * the pmap layer. When such a call would be necessary, return true in this boolean instead.
3309  */
3310 static kern_return_t
3311 vm_fault_pmap_enter(
3312         pmap_t pmap,
3313         vm_map_offset_t vaddr,
3314         vm_map_size_t fault_page_size,
3315         vm_map_offset_t fault_phys_offset,
3316         vm_page_t m,
3317         vm_prot_t *prot,
3318         vm_prot_t caller_prot,
3319         vm_prot_t fault_type,
3320         bool wired,
3321         int pmap_options,
3322         boolean_t *need_retry)
3323 {
3324         kern_return_t kr;
3325         if (need_retry != NULL) {
3326                 /*
3327                  * Although we don't hold a lock on this object, we hold a lock
3328                  * on the top object in the chain. To prevent a deadlock, we
3329                  * can't allow the pmap layer to block.
3330                  */
3331                 pmap_options |= PMAP_OPTIONS_NOWAIT;
3332         }
3333         kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
3334             fault_page_size, fault_phys_offset,
3335             m, prot, caller_prot, fault_type, wired, pmap_options);
3336         if (kr == KERN_RESOURCE_SHORTAGE) {
3337                 if (need_retry) {
3338                         /*
3339                          * There's nothing we can do here since we hold the
3340                          * lock on the top object in the chain. The caller
3341                          * will need to deal with this by dropping that lock and retrying.
3342                          */
3343                         *need_retry = TRUE;
3344                         vm_pmap_enter_retried++;
3345                 }
3346         }
3347         return kr;
3348 }
3349
3350 /*
3351  * Enter the given page into the pmap.
3352  * The vm map must be locked shared.
3353  * The vm object must be locked exclusive, unless this is a soft fault.
3354  * For a soft fault, the object must be locked shared or exclusive.
3355  *
3356  * @param need_retry if not null, avoid making a (potentially) blocking call into
3357  * the pmap layer. When such a call would be necessary, return true in this boolean instead.
3358  */
3359 static kern_return_t
3360 vm_fault_pmap_enter_with_object_lock(
3361         vm_object_t object,
3362         pmap_t pmap,
3363         vm_map_offset_t vaddr,
3364         vm_map_size_t fault_page_size,
3365         vm_map_offset_t fault_phys_offset,
3366         vm_page_t m,
3367         vm_prot_t *prot,
3368         vm_prot_t caller_prot,
3369         vm_prot_t fault_type,
3370         bool wired,
3371         int pmap_options,
3372         boolean_t *need_retry)
3373 {
3374         kern_return_t kr;
3375         /*
3376          * Prevent a deadlock by not
3377          * holding the object lock if we need to wait for a page in
3378          * pmap_enter() - <rdar://problem/7138958>
3379          */
3380         kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
3381             fault_page_size, fault_phys_offset,
3382             m, prot, caller_prot, fault_type, wired, pmap_options | PMAP_OPTIONS_NOWAIT);
3383 #if __x86_64__
3384         if (kr == KERN_INVALID_ARGUMENT &&
3385             pmap == PMAP_NULL &&
3386             wired) {
3387                 /*
3388                  * Wiring a page in a pmap-less VM map:
3389                  * VMware's "vmmon" kernel extension does this
3390                  * to grab pages.
3391                  * Let it proceed even though the PMAP_ENTER() failed.
3392                  */
3393                 kr = KERN_SUCCESS;
3394         }
3395 #endif /* __x86_64__ */
3396
3397         if (kr == KERN_RESOURCE_SHORTAGE) {
3398                 if (need_retry) {
3399                         /*
3400                          * this will be non-null in the case where we hold the lock
3401                          * on the top-object in this chain... we can't just drop
3402                          * the lock on the object we're inserting the page into
3403                          * and recall the PMAP_ENTER since we can still cause
3404                          * a deadlock if one of the critical paths tries to
3405                          * acquire the lock on the top-object and we're blocked
3406                          * in PMAP_ENTER waiting for memory... our only recourse
3407                          * is to deal with it at a higher level where we can
3408                          * drop both locks.
3409                          */
3410                         *need_retry = TRUE;
3411                         vm_pmap_enter_retried++;
3412                         goto done;
3413                 }
3414                 /*
3415                  * The nonblocking version of pmap_enter did not succeed.
3416                  * and we don't need to drop other locks and retry
3417                  * at the level above us, so
3418                  * use the blocking version instead. Requires marking
3419                  * the page busy and unlocking the object
3420                  */
3421                 boolean_t was_busy = m->vmp_busy;
3422
3423                 vm_object_lock_assert_exclusive(object);
3424
3425                 m->vmp_busy = TRUE;
3426                 vm_object_unlock(object);
3427
3428                 PMAP_ENTER_OPTIONS(pmap, vaddr,
3429                     fault_phys_offset,
3430                     m, *prot, fault_type,
3431                     0, wired,
3432                     pmap_options, kr);
3433
3434                 assert(VM_PAGE_OBJECT(m) == object);
3435
3436                 /* Take the object lock again. */
3437                 vm_object_lock(object);
3438
3439                 /* If the page was busy, someone else will wake it up.
3440                  * Otherwise, we have to do it now. */
3441                 assert(m->vmp_busy);
3442                 if (!was_busy) {
3443                         PAGE_WAKEUP_DONE(m);
3444                 }
3445                 vm_pmap_enter_blocked++;
3446         }
3447
3448 done:
3449         return kr;
3450 }
3451
3452 /*
3453  * Prepare to enter a page into the pmap by checking CS, protection bits,
3454  * and setting mapped bits on the page_t.
3455  * Does not modify the page's paging queue.
3456  *
3457  * page queue lock must NOT be held
3458  * m->vmp_object must be locked
3459  *
3460  * NOTE: m->vmp_object could be locked "shared" only if we are called
3461  * from vm_fault() as part of a soft fault.
3462  */
3463 static kern_return_t
3464 vm_fault_enter_prepare(
3465         vm_page_t m,
3466         pmap_t pmap,
3467         vm_map_offset_t vaddr,
3468         vm_prot_t *prot,
3469         vm_prot_t caller_prot,
3470         vm_map_size_t fault_page_size,
3471         vm_map_offset_t fault_phys_offset,
3472         boolean_t change_wiring,
3473         vm_prot_t fault_type,
3474         vm_object_fault_info_t fault_info,
3475         int *type_of_fault,
3476         bool *page_needs_data_sync)
3477 {
3478         kern_return_t   kr;
3479         bool            is_tainted = false;
3480         vm_object_t     object;
3481         boolean_t       cs_bypass = fault_info->cs_bypass;
3482
3483         object = VM_PAGE_OBJECT(m);
3484
3485         vm_object_lock_assert_held(object);
3486
3487 #if KASAN
3488         if (pmap == kernel_pmap) {
3489                 kasan_notify_address(vaddr, PAGE_SIZE);
3490         }
3491 #endif
3492
3493         LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
3494
3495         if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
3496                 vm_object_lock_assert_exclusive(object);
3497         } else if ((fault_type & VM_PROT_WRITE) == 0 &&
3498             !change_wiring &&
3499             (!m->vmp_wpmapped
3500 #if VM_OBJECT_ACCESS_TRACKING
3501             || object->access_tracking
3502 #endif /* VM_OBJECT_ACCESS_TRACKING */
3503             )) {
3504                 /*
3505                  * This is not a "write" fault, so we
3506                  * might not have taken the object lock
3507                  * exclusively and we might not be able
3508                  * to update the "wpmapped" bit in
3509                  * vm_fault_enter().
3510                  * Let's just grant read access to
3511                  * the page for now and we'll
3512                  * soft-fault again if we need write
3513                  * access later...
3514                  */
3515
3516                 /* This had better not be a JIT page. */
3517                 if (!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot)) {
3518                         *prot &= ~VM_PROT_WRITE;
3519                 } else {
3520                         assert(cs_bypass);
3521                 }
3522         }
3523         if (m->vmp_pmapped == FALSE) {
3524                 if (m->vmp_clustered) {
3525                         if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
3526                                 /*
3527                                  * found it in the cache, but this
3528                                  * is the first fault-in of the page (m->vmp_pmapped == FALSE)
3529                                  * so it must have come in as part of
3530                                  * a cluster... account 1 pagein against it
3531                                  */
3532                                 if (object->internal) {
3533                                         *type_of_fault = DBG_PAGEIND_FAULT;
3534                                 } else {
3535                                         *type_of_fault = DBG_PAGEINV_FAULT;
3536                                 }
3537
3538                                 VM_PAGE_COUNT_AS_PAGEIN(m);
3539                         }
3540                         VM_PAGE_CONSUME_CLUSTERED(m);
3541                 }
3542         }
3543
3544         if (*type_of_fault != DBG_COW_FAULT) {
3545                 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
3546
3547                 if (pmap == kernel_pmap) {
3548                         DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
3549                 }
3550         }
3551
3552         kr = vm_fault_validate_cs(cs_bypass, object, m, pmap, vaddr,
3553             *prot, caller_prot, fault_page_size, fault_phys_offset,
3554             fault_info, &is_tainted);
3555         if (kr == KERN_SUCCESS) {
3556                 /*
3557                  * We either have a good page, or a tainted page that has been accepted by the process.
3558                  * In both cases the page will be entered into the pmap.
3559                  */
3560                 *page_needs_data_sync = vm_fault_enter_set_mapped(object, m, *prot, fault_type);
3561                 if ((fault_type & VM_PROT_WRITE) && is_tainted) {
3562                         /*
3563                          * This page is tainted but we're inserting it anyways.
3564                          * Since it's writeable, we need to disconnect it from other pmaps
3565                          * now so those processes can take note.
3566                          */
3567
3568                         /*
3569                          * We can only get here
3570                          * because of the CSE logic
3571                          */
3572                         assert(pmap_get_vm_map_cs_enforced(pmap));
3573                         pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3574                         /*
3575                          * If we are faulting for a write, we can clear
3576                          * the execute bit - that will ensure the page is
3577                          * checked again before being executable, which
3578                          * protects against a map switch.
3579                          * This only happens the first time the page
3580                          * gets tainted, so we won't get stuck here
3581                          * to make an already writeable page executable.
3582                          */
3583                         if (!cs_bypass) {
3584                                 assert(!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot));
3585                                 *prot &= ~VM_PROT_EXECUTE;
3586                         }
3587                 }
3588                 assert(VM_PAGE_OBJECT(m) == object);
3589
3590 #if VM_OBJECT_ACCESS_TRACKING
3591                 if (object->access_tracking) {
3592                         DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type);
3593                         if (fault_type & VM_PROT_WRITE) {
3594                                 object->access_tracking_writes++;
3595                                 vm_object_access_tracking_writes++;
3596                         } else {
3597                                 object->access_tracking_reads++;
3598                                 vm_object_access_tracking_reads++;
3599                         }
3600                 }
3601 #endif /* VM_OBJECT_ACCESS_TRACKING */
3602         }
3603
3604         return kr;
3605 }
3606
3607 /*
3608  * page queue lock must NOT be held
3609  * m->vmp_object must be locked
3610  *
3611  * NOTE: m->vmp_object could be locked "shared" only if we are called
3612  * from vm_fault() as part of a soft fault.  If so, we must be
3613  * careful not to modify the VM object in any way that is not
3614  * legal under a shared lock...
3615  */
3616 kern_return_t
3617 vm_fault_enter(
3618         vm_page_t m,
3619         pmap_t pmap,
3620         vm_map_offset_t vaddr,
3621         vm_map_size_t fault_page_size,
3622         vm_map_offset_t fault_phys_offset,
3623         vm_prot_t prot,
3624         vm_prot_t caller_prot,
3625         boolean_t wired,
3626         boolean_t change_wiring,
3627         vm_tag_t  wire_tag,
3628         vm_object_fault_info_t fault_info,
3629         boolean_t *need_retry,
3630         int *type_of_fault)
3631 {
3632         kern_return_t   kr;
3633         vm_object_t     object;
3634         bool            page_needs_data_sync;
3635         vm_prot_t       fault_type;
3636         int             pmap_options = fault_info->pmap_options;
3637
3638         if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
3639                 assert(m->vmp_fictitious);
3640                 return KERN_SUCCESS;
3641         }
3642
3643         fault_type = change_wiring ? VM_PROT_NONE : caller_prot;
3644
3645         kr = vm_fault_enter_prepare(m, pmap, vaddr, &prot, caller_prot,
3646             fault_page_size, fault_phys_offset, change_wiring, fault_type,
3647             fault_info, type_of_fault, &page_needs_data_sync);
3648         object = VM_PAGE_OBJECT(m);
3649
3650         vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info->no_cache, type_of_fault, kr);
3651
3652         if (kr == KERN_SUCCESS) {
3653                 if (page_needs_data_sync) {
3654                         pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
3655                 }
3656
3657                 kr = vm_fault_pmap_enter_with_object_lock(object, pmap, vaddr,
3658                     fault_page_size, fault_phys_offset, m,
3659                     &prot, caller_prot, fault_type, wired, pmap_options, need_retry);
3660         }
3661
3662         return kr;
3663 }
3664
3665 void
3666 vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot)
3667 {
3668         if (pmap_find_phys(current_map()->pmap, vaddr) == 0) {
3669                 vm_fault(current_map(),      /* map */
3670                     vaddr,                   /* vaddr */
3671                     prot,                    /* fault_type */
3672                     FALSE,                   /* change_wiring */
3673                     VM_KERN_MEMORY_NONE,     /* tag - not wiring */
3674                     THREAD_UNINT,            /* interruptible */
3675                     NULL,                    /* caller_pmap */
3676                     0 /* caller_pmap_addr */);
3677         }
3678 }
3679
3680
3681 /*
3682  *      Routine:        vm_fault
3683  *      Purpose:
3684  *              Handle page faults, including pseudo-faults
3685  *              used to change the wiring status of pages.
3686  *      Returns:
3687  *              Explicit continuations have been removed.
3688  *      Implementation:
3689  *              vm_fault and vm_fault_page save mucho state
3690  *              in the moral equivalent of a closure.  The state
3691  *              structure is allocated when first entering vm_fault
3692  *              and deallocated when leaving vm_fault.
3693  */
3694
3695 extern uint64_t get_current_unique_pid(void);
3696
3697 unsigned long vm_fault_collapse_total = 0;
3698 unsigned long vm_fault_collapse_skipped = 0;
3699
3700
3701 kern_return_t
3702 vm_fault_external(
3703         vm_map_t        map,
3704         vm_map_offset_t vaddr,
3705         vm_prot_t       fault_type,
3706         boolean_t       change_wiring,
3707         int             interruptible,
3708         pmap_t          caller_pmap,
3709         vm_map_offset_t caller_pmap_addr)
3710 {
3711         return vm_fault_internal(map, vaddr, fault_type, change_wiring,
3712                    change_wiring ? vm_tag_bt() : VM_KERN_MEMORY_NONE,
3713                    interruptible, caller_pmap, caller_pmap_addr,
3714                    NULL);
3715 }
3716
3717 kern_return_t
3718 vm_fault(
3719         vm_map_t        map,
3720         vm_map_offset_t vaddr,
3721         vm_prot_t       fault_type,
3722         boolean_t       change_wiring,
3723         vm_tag_t        wire_tag,               /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3724         int             interruptible,
3725         pmap_t          caller_pmap,
3726         vm_map_offset_t caller_pmap_addr)
3727 {
3728         return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag,
3729                    interruptible, caller_pmap, caller_pmap_addr,
3730                    NULL);
3731 }
3732
3733 static boolean_t
3734 current_proc_is_privileged(void)
3735 {
3736         return csproc_get_platform_binary(current_proc());
3737 }
3738
3739 uint64_t vm_copied_on_read = 0;
3740
3741 /*
3742  * Cleanup after a vm_fault_enter.
3743  * At this point, the fault should either have failed (kr != KERN_SUCCESS)
3744  * or the page should be in the pmap and on the correct paging queue.
3745  *
3746  * Precondition:
3747  * map must be locked shared.
3748  * m_object must be locked.
3749  * If top_object != VM_OBJECT_NULL, it must be locked.
3750  * real_map must be locked.
3751  *
3752  * Postcondition:
3753  * map will be unlocked
3754  * m_object will be unlocked
3755  * top_object will be unlocked
3756  * If real_map != map, it will be unlocked
3757  */
3758 static void
3759 vm_fault_complete(
3760         vm_map_t map,
3761         vm_map_t real_map,
3762         vm_object_t object,
3763         vm_object_t m_object,
3764         vm_page_t m,
3765         vm_map_offset_t offset,
3766         vm_map_offset_t trace_real_vaddr,
3767         vm_object_fault_info_t fault_info,
3768         vm_prot_t caller_prot,
3769 #if CONFIG_DTRACE
3770         vm_map_offset_t real_vaddr,
3771 #else
3772         __unused vm_map_offset_t real_vaddr,
3773 #endif /* CONFIG_DTRACE */
3774         int type_of_fault,
3775         boolean_t need_retry,
3776         kern_return_t kr,
3777         ppnum_t *physpage_p,
3778         vm_prot_t prot,
3779         vm_object_t top_object,
3780         boolean_t need_collapse,
3781         vm_map_offset_t cur_offset,
3782         vm_prot_t fault_type,
3783         vm_object_t *written_on_object,
3784         memory_object_t *written_on_pager,
3785         vm_object_offset_t *written_on_offset)
3786 {
3787         int     event_code = 0;
3788         vm_map_lock_assert_shared(map);
3789         vm_object_lock_assert_held(m_object);
3790         if (top_object != VM_OBJECT_NULL) {
3791                 vm_object_lock_assert_held(top_object);
3792         }
3793         vm_map_lock_assert_held(real_map);
3794
3795         if (m_object->internal) {
3796                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
3797         } else if (m_object->object_is_shared_cache) {
3798                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
3799         } else {
3800                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
3801         }
3802
3803         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info->user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0);
3804         if (need_retry == FALSE) {
3805                 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_FAST), get_current_unique_pid(), 0, 0, 0, 0);
3806         }
3807         DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info->user_tag);
3808         if (kr == KERN_SUCCESS &&
3809             physpage_p != NULL) {
3810                 /* for vm_map_wire_and_extract() */
3811                 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
3812                 if (prot & VM_PROT_WRITE) {
3813                         vm_object_lock_assert_exclusive(m_object);
3814                         m->vmp_dirty = TRUE;
3815                 }
3816         }
3817
3818         if (top_object != VM_OBJECT_NULL) {
3819                 /*
3820                  * It's safe to drop the top object
3821                  * now that we've done our
3822                  * vm_fault_enter().  Any other fault
3823                  * in progress for that virtual
3824                  * address will either find our page
3825                  * and translation or put in a new page
3826                  * and translation.
3827                  */
3828                 vm_object_unlock(top_object);
3829                 top_object = VM_OBJECT_NULL;
3830         }
3831
3832         if (need_collapse == TRUE) {
3833                 vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
3834         }
3835
3836         if (need_retry == FALSE &&
3837             (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
3838                 /*
3839                  * evaluate access pattern and update state
3840                  * vm_fault_deactivate_behind depends on the
3841                  * state being up to date
3842                  */
3843                 vm_fault_is_sequential(m_object, cur_offset, fault_info->behavior);
3844
3845                 vm_fault_deactivate_behind(m_object, cur_offset, fault_info->behavior);
3846         }
3847         /*
3848          * That's it, clean up and return.
3849          */
3850         if (m->vmp_busy) {
3851                 vm_object_lock_assert_exclusive(m_object);
3852                 PAGE_WAKEUP_DONE(m);
3853         }
3854
3855         if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
3856                 vm_object_paging_begin(m_object);
3857
3858                 assert(*written_on_object == VM_OBJECT_NULL);
3859                 *written_on_object = m_object;
3860                 *written_on_pager = m_object->pager;
3861                 *written_on_offset = m_object->paging_offset + m->vmp_offset;
3862         }
3863         vm_object_unlock(object);
3864
3865         vm_map_unlock_read(map);
3866         if (real_map != map) {
3867                 vm_map_unlock(real_map);
3868         }
3869 }
3870
3871 static inline int
3872 vm_fault_type_for_tracing(boolean_t need_copy_on_read, int type_of_fault)
3873 {
3874         if (need_copy_on_read && type_of_fault == DBG_COW_FAULT) {
3875                 return DBG_COR_FAULT;
3876         }
3877         return type_of_fault;
3878 }
3879
3880 kern_return_t
3881 vm_fault_internal(
3882         vm_map_t        map,
3883         vm_map_offset_t vaddr,
3884         vm_prot_t       caller_prot,
3885         boolean_t       change_wiring,
3886         vm_tag_t        wire_tag,               /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3887         int             interruptible,
3888         pmap_t          caller_pmap,
3889         vm_map_offset_t caller_pmap_addr,
3890         ppnum_t         *physpage_p)
3891 {
3892         vm_map_version_t        version;        /* Map version for verificiation */
3893         boolean_t               wired;          /* Should mapping be wired down? */
3894         vm_object_t             object;         /* Top-level object */
3895         vm_object_offset_t      offset;         /* Top-level offset */
3896         vm_prot_t               prot;           /* Protection for mapping */
3897         vm_object_t             old_copy_object; /* Saved copy object */
3898         vm_page_t               result_page;    /* Result of vm_fault_page */
3899         vm_page_t               top_page;       /* Placeholder page */
3900         kern_return_t           kr;
3901
3902         vm_page_t               m;      /* Fast access to result_page */
3903         kern_return_t           error_code;
3904         vm_object_t             cur_object;
3905         vm_object_t             m_object = NULL;
3906         vm_object_offset_t      cur_offset;
3907         vm_page_t               cur_m;
3908         vm_object_t             new_object;
3909         int                     type_of_fault;
3910         pmap_t                  pmap;
3911         wait_interrupt_t        interruptible_state;
3912         vm_map_t                real_map = map;
3913         vm_map_t                original_map = map;
3914         bool                    object_locks_dropped = FALSE;
3915         vm_prot_t               fault_type;
3916         vm_prot_t               original_fault_type;
3917         struct vm_object_fault_info fault_info = {};
3918         bool                    need_collapse = FALSE;
3919         boolean_t               need_retry = FALSE;
3920         boolean_t               *need_retry_ptr = NULL;
3921         uint8_t                 object_lock_type = 0;
3922         uint8_t                 cur_object_lock_type;
3923         vm_object_t             top_object = VM_OBJECT_NULL;
3924         vm_object_t             written_on_object = VM_OBJECT_NULL;
3925         memory_object_t         written_on_pager = NULL;
3926         vm_object_offset_t      written_on_offset = 0;
3927         int                     throttle_delay;
3928         int                     compressed_count_delta;
3929         uint8_t                 grab_options;
3930         bool                    need_copy;
3931         bool                    need_copy_on_read;
3932         vm_map_offset_t         trace_vaddr;
3933         vm_map_offset_t         trace_real_vaddr;
3934         vm_map_size_t           fault_page_size;
3935         vm_map_size_t           fault_page_mask;
3936         vm_map_offset_t         fault_phys_offset;
3937         vm_map_offset_t         real_vaddr;
3938         bool                    resilient_media_retry = FALSE;
3939         vm_object_t             resilient_media_object = VM_OBJECT_NULL;
3940         vm_object_offset_t      resilient_media_offset = (vm_object_offset_t)-1;
3941         bool                    page_needs_data_sync = false;
3942         /*
3943          * Was the VM object contended when vm_map_lookup_locked locked it?
3944          * If so, the zero fill path will drop the lock
3945          * NB: Ideally we would always drop the lock rather than rely on
3946          * this heuristic, but vm_object_unlock currently takes > 30 cycles.
3947          */
3948         bool                    object_is_contended = false;
3949
3950         real_vaddr = vaddr;
3951         trace_real_vaddr = vaddr;
3952
3953         if (VM_MAP_PAGE_SIZE(original_map) < PAGE_SIZE) {
3954                 fault_phys_offset = (vm_map_offset_t)-1;
3955                 fault_page_size = VM_MAP_PAGE_SIZE(original_map);
3956                 fault_page_mask = VM_MAP_PAGE_MASK(original_map);
3957                 if (fault_page_size < PAGE_SIZE) {
3958                         DEBUG4K_FAULT("map %p vaddr 0x%llx caller_prot 0x%x\n", map, (uint64_t)trace_real_vaddr, caller_prot);
3959                         vaddr = vm_map_trunc_page(vaddr, fault_page_mask);
3960                 }
3961         } else {
3962                 fault_phys_offset = 0;
3963                 fault_page_size = PAGE_SIZE;
3964                 fault_page_mask = PAGE_MASK;
3965                 vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
3966         }
3967
3968         if (map == kernel_map) {
3969                 trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
3970                 trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
3971         } else {
3972                 trace_vaddr = vaddr;
3973         }
3974
3975         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3976             (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
3977             ((uint64_t)trace_vaddr >> 32),
3978             trace_vaddr,
3979             (map == kernel_map),
3980             0,
3981             0);
3982
3983         if (get_preemption_level() != 0) {
3984                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3985                     (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3986                     ((uint64_t)trace_vaddr >> 32),
3987                     trace_vaddr,
3988                     KERN_FAILURE,
3989                     0,
3990                     0);
3991
3992                 return KERN_FAILURE;
3993         }
3994
3995         thread_t cthread = current_thread();
3996         bool      rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
3997         uint64_t fstart = 0;
3998
3999         if (rtfault) {
4000                 fstart = mach_continuous_time();
4001         }
4002
4003         interruptible_state = thread_interrupt_level(interruptible);
4004
4005         fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
4006
4007         VM_STAT_INCR(faults);
4008         current_task()->faults++;
4009         original_fault_type = fault_type;
4010
4011         need_copy = FALSE;
4012         if (fault_type & VM_PROT_WRITE) {
4013                 need_copy = TRUE;
4014         }
4015
4016         if (need_copy || change_wiring) {
4017                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4018         } else {
4019                 object_lock_type = OBJECT_LOCK_SHARED;
4020         }
4021
4022         cur_object_lock_type = OBJECT_LOCK_SHARED;
4023
4024         if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
4025                 if (compressor_map) {
4026                         if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
4027                                 panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map));
4028                         }
4029                 }
4030         }
4031 RetryFault:
4032         assert(written_on_object == VM_OBJECT_NULL);
4033
4034         /*
4035          * assume we will hit a page in the cache
4036          * otherwise, explicitly override with
4037          * the real fault type once we determine it
4038          */
4039         type_of_fault = DBG_CACHE_HIT_FAULT;
4040
4041         /*
4042          *      Find the backing store object and offset into
4043          *      it to begin the search.
4044          */
4045         fault_type = original_fault_type;
4046         map = original_map;
4047         vm_map_lock_read(map);
4048
4049         if (resilient_media_retry) {
4050                 /*
4051                  * If we have to insert a fake zero-filled page to hide
4052                  * a media failure to provide the real page, we need to
4053                  * resolve any pending copy-on-write on this mapping.
4054                  * VM_PROT_COPY tells vm_map_lookup_locked() to deal
4055                  * with that even if this is not a "write" fault.
4056                  */
4057                 need_copy = TRUE;
4058                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4059         }
4060
4061         kr = vm_map_lookup_locked(&map, vaddr,
4062             (fault_type | (need_copy ? VM_PROT_COPY : 0)),
4063             object_lock_type, &version,
4064             &object, &offset, &prot, &wired,
4065             &fault_info,
4066             &real_map,
4067             &object_is_contended);
4068
4069         if (kr != KERN_SUCCESS) {
4070                 vm_map_unlock_read(map);
4071                 goto done;
4072         }
4073
4074
4075         pmap = real_map->pmap;
4076         fault_info.interruptible = interruptible;
4077         fault_info.stealth = FALSE;
4078         fault_info.io_sync = FALSE;
4079         fault_info.mark_zf_absent = FALSE;
4080         fault_info.batch_pmap_op = FALSE;
4081
4082         if (resilient_media_retry) {
4083                 /*
4084                  * We're retrying this fault after having detected a media
4085                  * failure from a "resilient_media" mapping.
4086                  * Check that the mapping is still pointing at the object
4087                  * that just failed to provide a page.
4088                  */
4089                 assert(resilient_media_object != VM_OBJECT_NULL);
4090                 assert(resilient_media_offset != (vm_object_offset_t)-1);
4091                 if (object != VM_OBJECT_NULL &&
4092                     object == resilient_media_object &&
4093                     offset == resilient_media_offset &&
4094                     fault_info.resilient_media) {
4095                         /*
4096                          * This mapping still points at the same object
4097                          * and is still "resilient_media": proceed in
4098                          * "recovery-from-media-failure" mode, where we'll
4099                          * insert a zero-filled page in the top object.
4100                          */
4101 //                     printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset);
4102                 } else {
4103                         /* not recovering: reset state */
4104 //                     printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info.resilient_media, object, resilient_media_object, offset, resilient_media_offset);
4105                         resilient_media_retry = FALSE;
4106                         /* release our extra reference on failed object */
4107 //                     printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
4108                         vm_object_deallocate(resilient_media_object);
4109                         resilient_media_object = VM_OBJECT_NULL;
4110                         resilient_media_offset = (vm_object_offset_t)-1;
4111                 }
4112         } else {
4113                 assert(resilient_media_object == VM_OBJECT_NULL);
4114                 resilient_media_offset = (vm_object_offset_t)-1;
4115         }
4116
4117         /*
4118          * If the page is wired, we must fault for the current protection
4119          * value, to avoid further faults.
4120          */
4121         if (wired) {
4122                 fault_type = prot | VM_PROT_WRITE;
4123         }
4124         if (wired || need_copy) {
4125                 /*
4126                  * since we're treating this fault as a 'write'
4127                  * we must hold the top object lock exclusively
4128                  */
4129                 if (object_lock_type == OBJECT_LOCK_SHARED) {
4130                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4131
4132                         if (vm_object_lock_upgrade(object) == FALSE) {
4133                                 /*
4134                                  * couldn't upgrade, so explicitly
4135                                  * take the lock exclusively
4136                                  */
4137                                 vm_object_lock(object);
4138                         }
4139                 }
4140         }
4141
4142 #if     VM_FAULT_CLASSIFY
4143         /*
4144          *      Temporary data gathering code
4145          */
4146         vm_fault_classify(object, offset, fault_type);
4147 #endif
4148         /*
4149          *      Fast fault code.  The basic idea is to do as much as
4150          *      possible while holding the map lock and object locks.
4151          *      Busy pages are not used until the object lock has to
4152          *      be dropped to do something (copy, zero fill, pmap enter).
4153          *      Similarly, paging references aren't acquired until that
4154          *      point, and object references aren't used.
4155          *
4156          *      If we can figure out what to do
4157          *      (zero fill, copy on write, pmap enter) while holding
4158          *      the locks, then it gets done.  Otherwise, we give up,
4159          *      and use the original fault path (which doesn't hold
4160          *      the map lock, and relies on busy pages).
4161          *      The give up cases include:
4162          *              - Have to talk to pager.
4163          *              - Page is busy, absent or in error.
4164          *              - Pager has locked out desired access.
4165          *              - Fault needs to be restarted.
4166          *              - Have to push page into copy object.
4167          *
4168          *      The code is an infinite loop that moves one level down
4169          *      the shadow chain each time.  cur_object and cur_offset
4170          *      refer to the current object being examined. object and offset
4171          *      are the original object from the map.  The loop is at the
4172          *      top level if and only if object and cur_object are the same.
4173          *
4174          *      Invariants:  Map lock is held throughout.  Lock is held on
4175          *              original object and cur_object (if different) when
4176          *              continuing or exiting loop.
4177          *
4178          */
4179
4180 #if defined(__arm64__)
4181         /*
4182          * Fail if reading an execute-only page in a
4183          * pmap that enforces execute-only protection.
4184          */
4185         if (fault_type == VM_PROT_READ &&
4186             (prot & VM_PROT_EXECUTE) &&
4187             !(prot & VM_PROT_READ) &&
4188             pmap_enforces_execute_only(pmap)) {
4189                 vm_object_unlock(object);
4190                 vm_map_unlock_read(map);
4191                 if (real_map != map) {
4192                         vm_map_unlock(real_map);
4193                 }
4194                 kr = KERN_PROTECTION_FAILURE;
4195                 goto done;
4196         }
4197 #endif
4198
4199         fault_phys_offset = (vm_map_offset_t)offset - vm_map_trunc_page((vm_map_offset_t)offset, PAGE_MASK);
4200
4201         /*
4202          * If this page is to be inserted in a copy delay object
4203          * for writing, and if the object has a copy, then the
4204          * copy delay strategy is implemented in the slow fault page.
4205          */
4206         if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
4207             object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) {
4208                 goto handle_copy_delay;
4209         }
4210
4211         cur_object = object;
4212         cur_offset = offset;
4213
4214         grab_options = 0;
4215 #if CONFIG_SECLUDED_MEMORY
4216         if (object->can_grab_secluded) {
4217                 grab_options |= VM_PAGE_GRAB_SECLUDED;
4218         }
4219 #endif /* CONFIG_SECLUDED_MEMORY */
4220
4221         while (TRUE) {
4222                 if (!cur_object->pager_created &&
4223                     cur_object->phys_contiguous) { /* superpage */
4224                         break;
4225                 }
4226
4227                 if (cur_object->blocked_access) {
4228                         /*
4229                          * Access to this VM object has been blocked.
4230                          * Let the slow path handle it.
4231                          */
4232                         break;
4233                 }
4234
4235                 m = vm_page_lookup(cur_object, vm_object_trunc_page(cur_offset));
4236                 m_object = NULL;
4237
4238                 if (m != VM_PAGE_NULL) {
4239                         m_object = cur_object;
4240
4241                         if (m->vmp_busy) {
4242                                 wait_result_t   result;
4243
4244                                 /*
4245                                  * in order to do the PAGE_ASSERT_WAIT, we must
4246                                  * have object that 'm' belongs to locked exclusively
4247                                  */
4248                                 if (object != cur_object) {
4249                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4250                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4251
4252                                                 if (vm_object_lock_upgrade(cur_object) == FALSE) {
4253                                                         /*
4254                                                          * couldn't upgrade so go do a full retry
4255                                                          * immediately since we can no longer be
4256                                                          * certain about cur_object (since we
4257                                                          * don't hold a reference on it)...
4258                                                          * first drop the top object lock
4259                                                          */
4260                                                         vm_object_unlock(object);
4261
4262                                                         vm_map_unlock_read(map);
4263                                                         if (real_map != map) {
4264                                                                 vm_map_unlock(real_map);
4265                                                         }
4266
4267                                                         goto RetryFault;
4268                                                 }
4269                                         }
4270                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4271                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4272
4273                                         if (vm_object_lock_upgrade(object) == FALSE) {
4274                                                 /*
4275                                                  * couldn't upgrade, so explicitly take the lock
4276                                                  * exclusively and go relookup the page since we
4277                                                  * will have dropped the object lock and
4278                                                  * a different thread could have inserted
4279                                                  * a page at this offset
4280                                                  * no need for a full retry since we're
4281                                                  * at the top level of the object chain
4282                                                  */
4283                                                 vm_object_lock(object);
4284
4285                                                 continue;
4286                                         }
4287                                 }
4288                                 if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
4289                                         /*
4290                                          * m->vmp_busy == TRUE and the object is locked exclusively
4291                                          * if m->vmp_q_state is still VM_PAGE_ON_PAGEOUT_Q after we acquire the
4292                                          * queues lock, we are guaranteed that it is stable on
4293                                          * the pageout queue and therefore reclaimable
4294                                          *
4295                                          * NOTE: this is only true for the internal pageout queue
4296                                          * in the compressor world
4297                                          */
4298                                         assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
4299
4300                                         vm_page_lock_queues();
4301
4302                                         if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
4303                                                 vm_pageout_throttle_up(m);
4304                                                 vm_page_unlock_queues();
4305
4306                                                 PAGE_WAKEUP_DONE(m);
4307                                                 goto reclaimed_from_pageout;
4308                                         }
4309                                         vm_page_unlock_queues();
4310                                 }
4311                                 if (object != cur_object) {
4312                                         vm_object_unlock(object);
4313                                 }
4314
4315                                 vm_map_unlock_read(map);
4316                                 if (real_map != map) {
4317                                         vm_map_unlock(real_map);
4318                                 }
4319
4320                                 result = PAGE_ASSERT_WAIT(m, interruptible);
4321
4322                                 vm_object_unlock(cur_object);
4323
4324                                 if (result == THREAD_WAITING) {
4325                                         result = thread_block(THREAD_CONTINUE_NULL);
4326
4327                                         counter(c_vm_fault_page_block_busy_kernel++);
4328                                 }
4329                                 if (result == THREAD_AWAKENED || result == THREAD_RESTART) {
4330                                         goto RetryFault;
4331                                 }
4332
4333                                 kr = KERN_ABORTED;
4334                                 goto done;
4335                         }
4336 reclaimed_from_pageout:
4337                         if (m->vmp_laundry) {
4338                                 if (object != cur_object) {
4339                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4340                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4341
4342                                                 vm_object_unlock(object);
4343                                                 vm_object_unlock(cur_object);
4344
4345                                                 vm_map_unlock_read(map);
4346                                                 if (real_map != map) {
4347                                                         vm_map_unlock(real_map);
4348                                                 }
4349
4350                                                 goto RetryFault;
4351                                         }
4352                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4353                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4354
4355                                         if (vm_object_lock_upgrade(object) == FALSE) {
4356                                                 /*
4357                                                  * couldn't upgrade, so explicitly take the lock
4358                                                  * exclusively and go relookup the page since we
4359                                                  * will have dropped the object lock and
4360                                                  * a different thread could have inserted
4361                                                  * a page at this offset
4362                                                  * no need for a full retry since we're
4363                                                  * at the top level of the object chain
4364                                                  */
4365                                                 vm_object_lock(object);
4366
4367                                                 continue;
4368                                         }
4369                                 }
4370                                 vm_pageout_steal_laundry(m, FALSE);
4371                         }
4372
4373                         if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
4374                                 /*
4375                                  * Guard page: let the slow path deal with it
4376                                  */
4377                                 break;
4378                         }
4379                         if (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent)) {
4380                                 /*
4381                                  * Unusual case... let the slow path deal with it
4382                                  */
4383                                 break;
4384                         }
4385                         if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
4386                                 if (object != cur_object) {
4387                                         vm_object_unlock(object);
4388                                 }
4389                                 vm_map_unlock_read(map);
4390                                 if (real_map != map) {
4391                                         vm_map_unlock(real_map);
4392                                 }
4393                                 vm_object_unlock(cur_object);
4394                                 kr = KERN_MEMORY_ERROR;
4395                                 goto done;
4396                         }
4397                         assert(m_object == VM_PAGE_OBJECT(m));
4398
4399                         if (vm_fault_cs_need_validation(map->pmap, m, m_object,
4400                             PAGE_SIZE, 0) ||
4401                             (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
4402 upgrade_lock_and_retry:
4403                                 /*
4404                                  * We might need to validate this page
4405                                  * against its code signature, so we
4406                                  * want to hold the VM object exclusively.
4407                                  */
4408                                 if (object != cur_object) {
4409                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4410                                                 vm_object_unlock(object);
4411                                                 vm_object_unlock(cur_object);
4412
4413                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4414
4415                                                 vm_map_unlock_read(map);
4416                                                 if (real_map != map) {
4417                                                         vm_map_unlock(real_map);
4418                                                 }
4419
4420                                                 goto RetryFault;
4421                                         }
4422                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4423                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4424
4425                                         if (vm_object_lock_upgrade(object) == FALSE) {
4426                                                 /*
4427                                                  * couldn't upgrade, so explicitly take the lock
4428                                                  * exclusively and go relookup the page since we
4429                                                  * will have dropped the object lock and
4430                                                  * a different thread could have inserted
4431                                                  * a page at this offset
4432                                                  * no need for a full retry since we're
4433                                                  * at the top level of the object chain
4434                                                  */
4435                                                 vm_object_lock(object);
4436
4437                                                 continue;
4438                                         }
4439                                 }
4440                         }
4441                         /*
4442                          *      Two cases of map in faults:
4443                          *          - At top level w/o copy object.
4444                          *          - Read fault anywhere.
4445                          *              --> must disallow write.
4446                          */
4447
4448                         if (object == cur_object && object->copy == VM_OBJECT_NULL) {
4449                                 goto FastPmapEnter;
4450                         }
4451
4452                         if (!need_copy &&
4453                             !fault_info.no_copy_on_read &&
4454                             cur_object != object &&
4455                             !cur_object->internal &&
4456                             !cur_object->pager_trusted &&
4457                             vm_protect_privileged_from_untrusted &&
4458                             !((prot & VM_PROT_EXECUTE) &&
4459                             cur_object->code_signed &&
4460                             pmap_get_vm_map_cs_enforced(caller_pmap ? caller_pmap : pmap)) &&
4461                             current_proc_is_privileged()) {
4462                                 /*
4463                                  * We're faulting on a page in "object" and
4464                                  * went down the shadow chain to "cur_object"
4465                                  * to find out that "cur_object"'s pager
4466                                  * is not "trusted", i.e. we can not trust it
4467                                  * to always return the same contents.
4468                                  * Since the target is a "privileged" process,
4469                                  * let's treat this as a copy-on-read fault, as
4470                                  * if it was a copy-on-write fault.
4471                                  * Once "object" gets a copy of this page, it
4472                                  * won't have to rely on "cur_object" to
4473                                  * provide the contents again.
4474                                  *
4475                                  * This is done by setting "need_copy" and
4476                                  * retrying the fault from the top with the
4477                                  * appropriate locking.
4478                                  *
4479                                  * Special case: if the mapping is executable
4480                                  * and the untrusted object is code-signed and
4481                                  * the process is "cs_enforced", we do not
4482                                  * copy-on-read because that would break
4483                                  * code-signing enforcement expectations (an
4484                                  * executable page must belong to a code-signed
4485                                  * object) and we can rely on code-signing
4486                                  * to re-validate the page if it gets evicted
4487                                  * and paged back in.
4488                                  */
4489 //                              printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset);
4490                                 vm_copied_on_read++;
4491                                 need_copy = TRUE;
4492
4493                                 vm_object_unlock(object);
4494                                 vm_object_unlock(cur_object);
4495                                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4496                                 vm_map_unlock_read(map);
4497                                 if (real_map != map) {
4498                                         vm_map_unlock(real_map);
4499                                 }
4500                                 goto RetryFault;
4501                         }
4502
4503                         if (!(fault_type & VM_PROT_WRITE) && !need_copy) {
4504                                 if (!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
4505                                         prot &= ~VM_PROT_WRITE;
4506                                 } else {
4507                                         /*
4508                                          * For a protection that the pmap cares
4509                                          * about, we must hand over the full
4510                                          * set of protections (so that the pmap
4511                                          * layer can apply any desired policy).
4512                                          * This means that cs_bypass must be
4513                                          * set, as this can force us to pass
4514                                          * RWX.
4515                                          */
4516                                         assert(fault_info.cs_bypass);
4517                                 }
4518
4519                                 if (object != cur_object) {
4520                                         /*
4521                                          * We still need to hold the top object
4522                                          * lock here to prevent a race between
4523                                          * a read fault (taking only "shared"
4524                                          * locks) and a write fault (taking
4525                                          * an "exclusive" lock on the top
4526                                          * object).
4527                                          * Otherwise, as soon as we release the
4528                                          * top lock, the write fault could
4529                                          * proceed and actually complete before
4530                                          * the read fault, and the copied page's
4531                                          * translation could then be overwritten
4532                                          * by the read fault's translation for
4533                                          * the original page.
4534                                          *
4535                                          * Let's just record what the top object
4536                                          * is and we'll release it later.
4537                                          */
4538                                         top_object = object;
4539
4540                                         /*
4541                                          * switch to the object that has the new page
4542                                          */
4543                                         object = cur_object;
4544                                         object_lock_type = cur_object_lock_type;
4545                                 }
4546 FastPmapEnter:
4547                                 assert(m_object == VM_PAGE_OBJECT(m));
4548
4549                                 /*
4550                                  * prepare for the pmap_enter...
4551                                  * object and map are both locked
4552                                  * m contains valid data
4553                                  * object == m->vmp_object
4554                                  * cur_object == NULL or it's been unlocked
4555                                  * no paging references on either object or cur_object
4556                                  */
4557                                 if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
4558                                         need_retry_ptr = &need_retry;
4559                                 } else {
4560                                         need_retry_ptr = NULL;
4561                                 }
4562
4563                                 if (fault_page_size < PAGE_SIZE) {
4564                                         DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx caller pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, caller_pmap, (uint64_t)caller_pmap_addr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
4565                                         assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
4566                                             fault_phys_offset < PAGE_SIZE),
4567                                             "0x%llx\n", (uint64_t)fault_phys_offset);
4568                                 } else {
4569                                         assertf(fault_phys_offset == 0,
4570                                             "0x%llx\n", (uint64_t)fault_phys_offset);
4571                                 }
4572
4573                                 if (caller_pmap) {
4574                                         kr = vm_fault_enter(m,
4575                                             caller_pmap,
4576                                             caller_pmap_addr,
4577                                             fault_page_size,
4578                                             fault_phys_offset,
4579                                             prot,
4580                                             caller_prot,
4581                                             wired,
4582                                             change_wiring,
4583                                             wire_tag,
4584                                             &fault_info,
4585                                             need_retry_ptr,
4586                                             &type_of_fault);
4587                                 } else {
4588                                         kr = vm_fault_enter(m,
4589                                             pmap,
4590                                             vaddr,
4591                                             fault_page_size,
4592                                             fault_phys_offset,
4593                                             prot,
4594                                             caller_prot,
4595                                             wired,
4596                                             change_wiring,
4597                                             wire_tag,
4598                                             &fault_info,
4599                                             need_retry_ptr,
4600                                             &type_of_fault);
4601                                 }
4602
4603                                 vm_fault_complete(
4604                                         map,
4605                                         real_map,
4606                                         object,
4607                                         m_object,
4608                                         m,
4609                                         offset,
4610                                         trace_real_vaddr,
4611                                         &fault_info,
4612                                         caller_prot,
4613                                         real_vaddr,
4614                                         vm_fault_type_for_tracing(need_copy_on_read, type_of_fault),
4615                                         need_retry,
4616                                         kr,
4617                                         physpage_p,
4618                                         prot,
4619                                         top_object,
4620                                         need_collapse,
4621                                         cur_offset,
4622                                         fault_type,
4623                                         &written_on_object,
4624                                         &written_on_pager,
4625                                         &written_on_offset);
4626                                 top_object = VM_OBJECT_NULL;
4627                                 if (need_retry == TRUE) {
4628                                         /*
4629                                          * vm_fault_enter couldn't complete the PMAP_ENTER...
4630                                          * at this point we don't hold any locks so it's safe
4631                                          * to ask the pmap layer to expand the page table to
4632                                          * accommodate this mapping... once expanded, we'll
4633                                          * re-drive the fault which should result in vm_fault_enter
4634                                          * being able to successfully enter the mapping this time around
4635                                          */
4636                                         (void)pmap_enter_options(
4637                                                 pmap, vaddr, 0, 0, 0, 0, 0,
4638                                                 PMAP_OPTIONS_NOENTER, NULL);
4639
4640                                         need_retry = FALSE;
4641                                         goto RetryFault;
4642                                 }
4643                                 goto done;
4644                         }
4645                         /*
4646                          * COPY ON WRITE FAULT
4647                          */
4648                         assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
4649
4650                         /*
4651                          * If objects match, then
4652                          * object->copy must not be NULL (else control
4653                          * would be in previous code block), and we
4654                          * have a potential push into the copy object
4655                          * with which we can't cope here.
4656                          */
4657                         if (cur_object == object) {
4658                                 /*
4659                                  * must take the slow path to
4660                                  * deal with the copy push
4661                                  */
4662                                 break;
4663                         }
4664
4665                         /*
4666                          * This is now a shadow based copy on write
4667                          * fault -- it requires a copy up the shadow
4668                          * chain.
4669                          */
4670                         assert(m_object == VM_PAGE_OBJECT(m));
4671
4672                         if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
4673                             vm_fault_cs_need_validation(NULL, m, m_object,
4674                             PAGE_SIZE, 0)) {
4675                                 goto upgrade_lock_and_retry;
4676                         }
4677
4678                         /*
4679                          * Allocate a page in the original top level
4680                          * object. Give up if allocate fails.  Also
4681                          * need to remember current page, as it's the
4682                          * source of the copy.
4683                          *
4684                          * at this point we hold locks on both
4685                          * object and cur_object... no need to take
4686                          * paging refs or mark pages BUSY since
4687                          * we don't drop either object lock until
4688                          * the page has been copied and inserted
4689                          */
4690                         cur_m = m;
4691                         m = vm_page_grab_options(grab_options);
4692                         m_object = NULL;
4693
4694                         if (m == VM_PAGE_NULL) {
4695                                 /*
4696                                  * no free page currently available...
4697                                  * must take the slow path
4698                                  */
4699                                 break;
4700                         }
4701                         /*
4702                          * Now do the copy.  Mark the source page busy...
4703                          *
4704                          *      NOTE: This code holds the map lock across
4705                          *      the page copy.
4706                          */
4707                         vm_page_copy(cur_m, m);
4708                         vm_page_insert(m, object, vm_object_trunc_page(offset));
4709                         if (VM_MAP_PAGE_MASK(map) != PAGE_MASK) {
4710                                 DEBUG4K_FAULT("map %p vaddr 0x%llx page %p [%p 0x%llx] copied to %p [%p 0x%llx]\n", map, (uint64_t)vaddr, cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
4711                         }
4712                         m_object = object;
4713                         SET_PAGE_DIRTY(m, FALSE);
4714
4715                         /*
4716                          * Now cope with the source page and object
4717                          */
4718                         if (object->ref_count > 1 && cur_m->vmp_pmapped) {
4719                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
4720                         } else if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
4721                                 /*
4722                                  * We've copied the full 16K page but we're
4723                                  * about to call vm_fault_enter() only for
4724                                  * the 4K chunk we're faulting on.  The other
4725                                  * three 4K chunks in that page could still
4726                                  * be pmapped in this pmap.
4727                                  * Since the VM object layer thinks that the
4728                                  * entire page has been dealt with and the
4729                                  * original page might no longer be needed,
4730                                  * it might collapse/bypass the original VM
4731                                  * object and free its pages, which would be
4732                                  * bad (and would trigger pmap_verify_free()
4733                                  * assertions) if the other 4K chunks are still
4734                                  * pmapped.
4735                                  */
4736                                 /*
4737                                  * XXX FBDP TODO4K: to be revisited
4738                                  * Technically, we need to pmap_disconnect()
4739                                  * only the target pmap's mappings for the 4K
4740                                  * chunks of this 16K VM page.  If other pmaps
4741                                  * have PTEs on these chunks, that means that
4742                                  * the associated VM map must have a reference
4743                                  * on the VM object, so no need to worry about
4744                                  * those.
4745                                  * pmap_protect() for each 4K chunk would be
4746                                  * better but we'd have to check which chunks
4747                                  * are actually mapped before and after this
4748                                  * one.
4749                                  * A full-blown pmap_disconnect() is easier
4750                                  * for now but not efficient.
4751                                  */
4752                                 DEBUG4K_FAULT("pmap_disconnect() page %p object %p offset 0x%llx phys 0x%x\n", cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, VM_PAGE_GET_PHYS_PAGE(cur_m));
4753                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
4754                         }
4755
4756                         if (cur_m->vmp_clustered) {
4757                                 VM_PAGE_COUNT_AS_PAGEIN(cur_m);
4758                                 VM_PAGE_CONSUME_CLUSTERED(cur_m);
4759                                 vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior);
4760                         }
4761                         need_collapse = TRUE;
4762
4763                         if (!cur_object->internal &&
4764                             cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
4765                                 /*
4766                                  * The object from which we've just
4767                                  * copied a page is most probably backed
4768                                  * by a vnode.  We don't want to waste too
4769                                  * much time trying to collapse the VM objects
4770                                  * and create a bottleneck when several tasks
4771                                  * map the same file.
4772                                  */
4773                                 if (cur_object->copy == object) {
4774                                         /*
4775                                          * Shared mapping or no COW yet.
4776                                          * We can never collapse a copy
4777                                          * object into its backing object.
4778                                          */
4779                                         need_collapse = FALSE;
4780                                 } else if (cur_object->copy == object->shadow &&
4781                                     object->shadow->resident_page_count == 0) {
4782                                         /*
4783                                          * Shared mapping after a COW occurred.
4784                                          */
4785                                         need_collapse = FALSE;
4786                                 }
4787                         }
4788                         vm_object_unlock(cur_object);
4789
4790                         if (need_collapse == FALSE) {
4791                                 vm_fault_collapse_skipped++;
4792                         }
4793                         vm_fault_collapse_total++;
4794
4795                         type_of_fault = DBG_COW_FAULT;
4796                         VM_STAT_INCR(cow_faults);
4797                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
4798                         current_task()->cow_faults++;
4799
4800                         goto FastPmapEnter;
4801                 } else {
4802                         /*
4803                          * No page at cur_object, cur_offset... m == NULL
4804                          */
4805                         if (cur_object->pager_created) {
4806                                 vm_external_state_t compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
4807
4808                                 if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
4809                                         int             my_fault_type;
4810                                         uint8_t         c_flags = C_DONT_BLOCK;
4811                                         bool            insert_cur_object = FALSE;
4812
4813                                         /*
4814                                          * May have to talk to a pager...
4815                                          * if so, take the slow path by
4816                                          * doing a 'break' from the while (TRUE) loop
4817                                          *
4818                                          * external_state will only be set to VM_EXTERNAL_STATE_EXISTS
4819                                          * if the compressor is active and the page exists there
4820                                          */
4821                                         if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS) {
4822                                                 break;
4823                                         }
4824
4825                                         if (map == kernel_map || real_map == kernel_map) {
4826                                                 /*
4827                                                  * can't call into the compressor with the kernel_map
4828                                                  * lock held, since the compressor may try to operate
4829                                                  * on the kernel map in order to return an empty c_segment
4830                                                  */
4831                                                 break;
4832                                         }
4833                                         if (object != cur_object) {
4834                                                 if (fault_type & VM_PROT_WRITE) {
4835                                                         c_flags |= C_KEEP;
4836                                                 } else {
4837                                                         insert_cur_object = TRUE;
4838                                                 }
4839                                         }
4840                                         if (insert_cur_object == TRUE) {
4841                                                 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4842                                                         cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4843
4844                                                         if (vm_object_lock_upgrade(cur_object) == FALSE) {
4845                                                                 /*
4846                                                                  * couldn't upgrade so go do a full retry
4847                                                                  * immediately since we can no longer be
4848                                                                  * certain about cur_object (since we
4849                                                                  * don't hold a reference on it)...
4850                                                                  * first drop the top object lock
4851                                                                  */
4852                                                                 vm_object_unlock(object);
4853
4854                                                                 vm_map_unlock_read(map);
4855                                                                 if (real_map != map) {
4856                                                                         vm_map_unlock(real_map);
4857                                                                 }
4858
4859                                                                 goto RetryFault;
4860                                                         }
4861                                                 }
4862                                         } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4863                                                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4864
4865                                                 if (object != cur_object) {
4866                                                         /*
4867                                                          * we can't go for the upgrade on the top
4868                                                          * lock since the upgrade may block waiting
4869                                                          * for readers to drain... since we hold
4870                                                          * cur_object locked at this point, waiting
4871                                                          * for the readers to drain would represent
4872                                                          * a lock order inversion since the lock order
4873                                                          * for objects is the reference order in the
4874                                                          * shadow chain
4875                                                          */
4876                                                         vm_object_unlock(object);
4877                                                         vm_object_unlock(cur_object);
4878
4879                                                         vm_map_unlock_read(map);
4880                                                         if (real_map != map) {
4881                                                                 vm_map_unlock(real_map);
4882                                                         }
4883
4884                                                         goto RetryFault;
4885                                                 }
4886                                                 if (vm_object_lock_upgrade(object) == FALSE) {
4887                                                         /*
4888                                                          * couldn't upgrade, so explicitly take the lock
4889                                                          * exclusively and go relookup the page since we
4890                                                          * will have dropped the object lock and
4891                                                          * a different thread could have inserted
4892                                                          * a page at this offset
4893                                                          * no need for a full retry since we're
4894                                                          * at the top level of the object chain
4895                                                          */
4896                                                         vm_object_lock(object);
4897
4898                                                         continue;
4899                                                 }
4900                                         }
4901                                         m = vm_page_grab_options(grab_options);
4902                                         m_object = NULL;
4903
4904                                         if (m == VM_PAGE_NULL) {
4905                                                 /*
4906                                                  * no free page currently available...
4907                                                  * must take the slow path
4908                                                  */
4909                                                 break;
4910                                         }
4911
4912                                         /*
4913                                          * The object is and remains locked
4914                                          * so no need to take a
4915                                          * "paging_in_progress" reference.
4916                                          */
4917                                         bool      shared_lock;
4918                                         if ((object == cur_object &&
4919                                             object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
4920                                             (object != cur_object &&
4921                                             cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
4922                                                 shared_lock = FALSE;
4923                                         } else {
4924                                                 shared_lock = TRUE;
4925                                         }
4926
4927                                         kr = vm_compressor_pager_get(
4928                                                 cur_object->pager,
4929                                                 (vm_object_trunc_page(cur_offset)
4930                                                 + cur_object->paging_offset),
4931                                                 VM_PAGE_GET_PHYS_PAGE(m),
4932                                                 &my_fault_type,
4933                                                 c_flags,
4934                                                 &compressed_count_delta);
4935
4936                                         vm_compressor_pager_count(
4937                                                 cur_object->pager,
4938                                                 compressed_count_delta,
4939                                                 shared_lock,
4940                                                 cur_object);
4941
4942                                         if (kr != KERN_SUCCESS) {
4943                                                 vm_page_release(m, FALSE);
4944                                                 m = VM_PAGE_NULL;
4945                                         }
4946                                         /*
4947                                          * If vm_compressor_pager_get() returns
4948                                          * KERN_MEMORY_FAILURE, then the
4949                                          * compressed data is permanently lost,
4950                                          * so return this error immediately.
4951                                          */
4952                                         if (kr == KERN_MEMORY_FAILURE) {
4953                                                 if (object != cur_object) {
4954                                                         vm_object_unlock(cur_object);
4955                                                 }
4956                                                 vm_object_unlock(object);
4957                                                 vm_map_unlock_read(map);
4958                                                 if (real_map != map) {
4959                                                         vm_map_unlock(real_map);
4960                                                 }
4961                                                 goto done;
4962                                         } else if (kr != KERN_SUCCESS) {
4963                                                 break;
4964                                         }
4965                                         m->vmp_dirty = TRUE;
4966
4967                                         /*
4968                                          * If the object is purgeable, its
4969                                          * owner's purgeable ledgers will be
4970                                          * updated in vm_page_insert() but the
4971                                          * page was also accounted for in a
4972                                          * "compressed purgeable" ledger, so
4973                                          * update that now.
4974                                          */
4975                                         if (object != cur_object &&
4976                                             !insert_cur_object) {
4977                                                 /*
4978                                                  * We're not going to insert
4979                                                  * the decompressed page into
4980                                                  * the object it came from.
4981                                                  *
4982                                                  * We're dealing with a
4983                                                  * copy-on-write fault on
4984                                                  * "object".
4985                                                  * We're going to decompress
4986                                                  * the page directly into the
4987                                                  * target "object" while
4988                                                  * keeping the compressed
4989                                                  * page for "cur_object", so
4990                                                  * no ledger update in that
4991                                                  * case.
4992                                                  */
4993                                         } else if (((cur_object->purgable ==
4994                                             VM_PURGABLE_DENY) &&
4995                                             (!cur_object->vo_ledger_tag)) ||
4996                                             (cur_object->vo_owner ==
4997                                             NULL)) {
4998                                                 /*
4999                                                  * "cur_object" is not purgeable
5000                                                  * and is not ledger-tagged, or
5001                                                  * there's no owner for it,
5002                                                  * so no owner's ledgers to
5003                                                  * update.
5004                                                  */
5005                                         } else {
5006                                                 /*
5007                                                  * One less compressed
5008                                                  * purgeable/tagged page for
5009                                                  * cur_object's owner.
5010                                                  */
5011                                                 vm_object_owner_compressed_update(
5012                                                         cur_object,
5013                                                         -1);
5014                                         }
5015
5016                                         if (insert_cur_object) {
5017                                                 vm_page_insert(m, cur_object, vm_object_trunc_page(cur_offset));
5018                                                 m_object = cur_object;
5019                                         } else {
5020                                                 vm_page_insert(m, object, vm_object_trunc_page(offset));
5021                                                 m_object = object;
5022                                         }
5023
5024                                         if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
5025                                                 /*
5026                                                  * If the page is not cacheable,
5027                                                  * we can't let its contents
5028                                                  * linger in the data cache
5029                                                  * after the decompression.
5030                                                  */
5031                                                 pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m));
5032                                         }
5033
5034                                         type_of_fault = my_fault_type;
5035
5036                                         VM_STAT_DECOMPRESSIONS();
5037
5038                                         if (cur_object != object) {
5039                                                 if (insert_cur_object) {
5040                                                         top_object = object;
5041                                                         /*
5042                                                          * switch to the object that has the new page
5043                                                          */
5044                                                         object = cur_object;
5045                                                         object_lock_type = cur_object_lock_type;
5046                                                 } else {
5047                                                         vm_object_unlock(cur_object);
5048                                                         cur_object = object;
5049                                                 }
5050                                         }
5051                                         goto FastPmapEnter;
5052                                 }
5053                                 /*
5054                                  * existence map present and indicates
5055                                  * that the pager doesn't have this page
5056                                  */
5057                         }
5058                         if (cur_object->shadow == VM_OBJECT_NULL ||
5059                             resilient_media_retry) {
5060                                 /*
5061                                  * Zero fill fault.  Page gets
5062                                  * inserted into the original object.
5063                                  */
5064                                 if (cur_object->shadow_severed ||
5065                                     VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) ||
5066                                     cur_object == compressor_object ||
5067                                     cur_object == kernel_object ||
5068                                     cur_object == vm_submap_object) {
5069                                         if (object != cur_object) {
5070                                                 vm_object_unlock(cur_object);
5071                                         }
5072                                         vm_object_unlock(object);
5073
5074                                         vm_map_unlock_read(map);
5075                                         if (real_map != map) {
5076                                                 vm_map_unlock(real_map);
5077                                         }
5078
5079                                         kr = KERN_MEMORY_ERROR;
5080                                         goto done;
5081                                 }
5082                                 if (cur_object != object) {
5083                                         vm_object_unlock(cur_object);
5084
5085                                         cur_object = object;
5086                                 }
5087                                 if (object_lock_type == OBJECT_LOCK_SHARED) {
5088                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5089
5090                                         if (vm_object_lock_upgrade(object) == FALSE) {
5091                                                 /*
5092                                                  * couldn't upgrade so do a full retry on the fault
5093                                                  * since we dropped the object lock which
5094                                                  * could allow another thread to insert
5095                                                  * a page at this offset
5096                                                  */
5097                                                 vm_map_unlock_read(map);
5098                                                 if (real_map != map) {
5099                                                         vm_map_unlock(real_map);
5100                                                 }
5101
5102                                                 goto RetryFault;
5103                                         }
5104                                 }
5105                                 if (!object->internal) {
5106                                         panic("%s:%d should not zero-fill page at offset 0x%llx in external object %p", __FUNCTION__, __LINE__, (uint64_t)offset, object);
5107                                 }
5108                                 m = vm_page_alloc(object, vm_object_trunc_page(offset));
5109                                 m_object = NULL;
5110
5111                                 if (m == VM_PAGE_NULL) {
5112                                         /*
5113                                          * no free page currently available...
5114                                          * must take the slow path
5115                                          */
5116                                         break;
5117                                 }
5118                                 m_object = object;
5119
5120                                 /*
5121                                  * Zeroing the page and entering it into the pmap
5122                                  * represents a significant amount of the zero fill fault handler's work.
5123                                  *
5124                                  * To improve fault scalability, we'll drop the object lock, if it appears contended,
5125                                  * now that we've inserted the page into the vm object.
5126                                  * Before dropping the lock, we need to check protection bits and set the
5127                                  * mapped bits on the page. Then we can mark the page busy, drop the lock,
5128                                  * zero it, and do the pmap enter. We'll need to reacquire the lock
5129                                  * to clear the busy bit and wake up any waiters.
5130                                  */
5131                                 vm_fault_cs_clear(m);
5132                                 m->vmp_pmapped = TRUE;
5133                                 if (map->no_zero_fill) {
5134                                         type_of_fault = DBG_NZF_PAGE_FAULT;
5135                                 } else {
5136                                         type_of_fault = DBG_ZERO_FILL_FAULT;
5137                                 }
5138                                 {
5139                                         pmap_t destination_pmap;
5140                                         vm_map_offset_t destination_pmap_vaddr;
5141                                         vm_prot_t enter_fault_type;
5142                                         if (caller_pmap) {
5143                                                 destination_pmap = caller_pmap;
5144                                                 destination_pmap_vaddr = caller_pmap_addr;
5145                                         } else {
5146                                                 destination_pmap = pmap;
5147                                                 destination_pmap_vaddr = vaddr;
5148                                         }
5149                                         if (change_wiring) {
5150                                                 enter_fault_type = VM_PROT_NONE;
5151                                         } else {
5152                                                 enter_fault_type = caller_prot;
5153                                         }
5154                                         kr = vm_fault_enter_prepare(m,
5155                                             destination_pmap,
5156                                             destination_pmap_vaddr,
5157                                             &prot,
5158                                             caller_prot,
5159                                             fault_page_size,
5160                                             fault_phys_offset,
5161                                             change_wiring,
5162                                             enter_fault_type,
5163                                             &fault_info,
5164                                             &type_of_fault,
5165                                             &page_needs_data_sync);
5166                                         if (kr != KERN_SUCCESS) {
5167                                                 goto zero_fill_cleanup;
5168                                         }
5169
5170                                         if (object_is_contended) {
5171                                                 /*
5172                                                  * At this point the page is in the vm object, but not on a paging queue.
5173                                                  * Since it's accessible to another thread but its contents are invalid
5174                                                  * (it hasn't been zeroed) mark it busy before dropping the object lock.
5175                                                  */
5176                                                 m->vmp_busy = TRUE;
5177                                                 vm_object_unlock(object);
5178                                         }
5179                                         if (type_of_fault == DBG_ZERO_FILL_FAULT) {
5180                                                 /*
5181                                                  * Now zero fill page...
5182                                                  * the page is probably going to
5183                                                  * be written soon, so don't bother
5184                                                  * to clear the modified bit
5185                                                  *
5186                                                  *   NOTE: This code holds the map
5187                                                  *   lock across the zero fill.
5188                                                  */
5189                                                 vm_page_zero_fill(m);
5190                                                 VM_STAT_INCR(zero_fill_count);
5191                                                 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
5192                                         }
5193                                         if (page_needs_data_sync) {
5194                                                 pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
5195                                         }
5196
5197                                         if (top_object != VM_OBJECT_NULL) {
5198                                                 need_retry_ptr = &need_retry;
5199                                         } else {
5200                                                 need_retry_ptr = NULL;
5201                                         }
5202                                         if (object_is_contended) {
5203                                                 kr = vm_fault_pmap_enter(destination_pmap, destination_pmap_vaddr,
5204                                                     fault_page_size, fault_phys_offset,
5205                                                     m, &prot, caller_prot, enter_fault_type, wired,
5206                                                     fault_info.pmap_options, need_retry_ptr);
5207                                                 vm_object_lock(object);
5208                                         } else {
5209                                                 kr = vm_fault_pmap_enter_with_object_lock(object, destination_pmap, destination_pmap_vaddr,
5210                                                     fault_page_size, fault_phys_offset,
5211                                                     m, &prot, caller_prot, enter_fault_type, wired,
5212                                                     fault_info.pmap_options, need_retry_ptr);
5213                                         }
5214                                 }
5215 zero_fill_cleanup:
5216                                 if (!VM_DYNAMIC_PAGING_ENABLED() &&
5217                                     (object->purgable == VM_PURGABLE_DENY ||
5218                                     object->purgable == VM_PURGABLE_NONVOLATILE ||
5219                                     object->purgable == VM_PURGABLE_VOLATILE)) {
5220                                         vm_page_lockspin_queues();
5221                                         if (!VM_DYNAMIC_PAGING_ENABLED()) {
5222                                                 vm_fault_enqueue_throttled_locked(m);
5223                                         }
5224                                         vm_page_unlock_queues();
5225                                 }
5226                                 vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info.no_cache, &type_of_fault, kr);
5227
5228                                 vm_fault_complete(
5229                                         map,
5230                                         real_map,
5231                                         object,
5232                                         m_object,
5233                                         m,
5234                                         offset,
5235                                         trace_real_vaddr,
5236                                         &fault_info,
5237                                         caller_prot,
5238                                         real_vaddr,
5239                                         type_of_fault,
5240                                         need_retry,
5241                                         kr,
5242                                         physpage_p,
5243                                         prot,
5244                                         top_object,
5245                                         need_collapse,
5246                                         cur_offset,
5247                                         fault_type,
5248                                         &written_on_object,
5249                                         &written_on_pager,
5250                                         &written_on_offset);
5251                                 top_object = VM_OBJECT_NULL;
5252                                 if (need_retry == TRUE) {
5253                                         /*
5254                                          * vm_fault_enter couldn't complete the PMAP_ENTER...
5255                                          * at this point we don't hold any locks so it's safe
5256                                          * to ask the pmap layer to expand the page table to
5257                                          * accommodate this mapping... once expanded, we'll
5258                                          * re-drive the fault which should result in vm_fault_enter
5259                                          * being able to successfully enter the mapping this time around
5260                                          */
5261                                         (void)pmap_enter_options(
5262                                                 pmap, vaddr, 0, 0, 0, 0, 0,
5263                                                 PMAP_OPTIONS_NOENTER, NULL);
5264
5265                                         need_retry = FALSE;
5266                                         goto RetryFault;
5267                                 }
5268                                 goto done;
5269                         }
5270                         /*
5271                          * On to the next level in the shadow chain
5272                          */
5273                         cur_offset += cur_object->vo_shadow_offset;
5274                         new_object = cur_object->shadow;
5275                         fault_phys_offset = cur_offset - vm_object_trunc_page(cur_offset);
5276
5277                         /*
5278                          * take the new_object's lock with the indicated state
5279                          */
5280                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
5281                                 vm_object_lock_shared(new_object);
5282                         } else {
5283                                 vm_object_lock(new_object);
5284                         }
5285
5286                         if (cur_object != object) {
5287                                 vm_object_unlock(cur_object);
5288                         }
5289
5290                         cur_object = new_object;
5291
5292                         continue;
5293                 }
5294         }
5295         /*
5296          * Cleanup from fast fault failure.  Drop any object
5297          * lock other than original and drop map lock.
5298          */
5299         if (object != cur_object) {
5300                 vm_object_unlock(cur_object);
5301         }
5302
5303         /*
5304          * must own the object lock exclusively at this point
5305          */
5306         if (object_lock_type == OBJECT_LOCK_SHARED) {
5307                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5308
5309                 if (vm_object_lock_upgrade(object) == FALSE) {
5310                         /*
5311                          * couldn't upgrade, so explicitly
5312                          * take the lock exclusively
5313                          * no need to retry the fault at this
5314                          * point since "vm_fault_page" will
5315                          * completely re-evaluate the state
5316                          */
5317                         vm_object_lock(object);
5318                 }
5319         }
5320
5321 handle_copy_delay:
5322         vm_map_unlock_read(map);
5323         if (real_map != map) {
5324                 vm_map_unlock(real_map);
5325         }
5326
5327         if (__improbable(object == compressor_object ||
5328             object == kernel_object ||
5329             object == vm_submap_object)) {
5330                 /*
5331                  * These objects are explicitly managed and populated by the
5332                  * kernel.  The virtual ranges backed by these objects should
5333                  * either have wired pages or "holes" that are not supposed to
5334                  * be accessed at all until they get explicitly populated.
5335                  * We should never have to resolve a fault on a mapping backed
5336                  * by one of these VM objects and providing a zero-filled page
5337                  * would be wrong here, so let's fail the fault and let the
5338                  * caller crash or recover.
5339                  */
5340                 vm_object_unlock(object);
5341                 kr = KERN_MEMORY_ERROR;
5342                 goto done;
5343         }
5344
5345         assert(object != compressor_object);
5346         assert(object != kernel_object);
5347         assert(object != vm_submap_object);
5348
5349         if (resilient_media_retry) {
5350                 /*
5351                  * We could get here if we failed to get a free page
5352                  * to zero-fill and had to take the slow path again.
5353                  * Reset our "recovery-from-failed-media" state.
5354                  */
5355                 assert(resilient_media_object != VM_OBJECT_NULL);
5356                 assert(resilient_media_offset != (vm_object_offset_t)-1);
5357                 /* release our extra reference on failed object */
5358 //             printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
5359                 vm_object_deallocate(resilient_media_object);
5360                 resilient_media_object = VM_OBJECT_NULL;
5361                 resilient_media_offset = (vm_object_offset_t)-1;
5362                 resilient_media_retry = FALSE;
5363         }
5364
5365         /*
5366          * Make a reference to this object to
5367          * prevent its disposal while we are messing with
5368          * it.  Once we have the reference, the map is free
5369          * to be diddled.  Since objects reference their
5370          * shadows (and copies), they will stay around as well.
5371          */
5372         vm_object_reference_locked(object);
5373         vm_object_paging_begin(object);
5374
5375         set_thread_pagein_error(cthread, 0);
5376         error_code = 0;
5377
5378         result_page = VM_PAGE_NULL;
5379         kr = vm_fault_page(object, offset, fault_type,
5380             (change_wiring && !wired),
5381             FALSE,                /* page not looked up */
5382             &prot, &result_page, &top_page,
5383             &type_of_fault,
5384             &error_code, map->no_zero_fill,
5385             FALSE, &fault_info);
5386
5387         /*
5388          * if kr != VM_FAULT_SUCCESS, then the paging reference
5389          * has been dropped and the object unlocked... the ref_count
5390          * is still held
5391          *
5392          * if kr == VM_FAULT_SUCCESS, then the paging reference
5393          * is still held along with the ref_count on the original object
5394          *
5395          *      the object is returned locked with a paging reference
5396          *
5397          *      if top_page != NULL, then it's BUSY and the
5398          *      object it belongs to has a paging reference
5399          *      but is returned unlocked
5400          */
5401         if (kr != VM_FAULT_SUCCESS &&
5402             kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
5403                 if (kr == VM_FAULT_MEMORY_ERROR &&
5404                     fault_info.resilient_media) {
5405                         assertf(object->internal, "object %p", object);
5406                         /*
5407                          * This fault failed but the mapping was
5408                          * "media resilient", so we'll retry the fault in
5409                          * recovery mode to get a zero-filled page in the
5410                          * top object.
5411                          * Keep the reference on the failing object so
5412                          * that we can check that the mapping is still
5413                          * pointing to it when we retry the fault.
5414                          */
5415 //                     printf("RESILIENT_MEDIA %s:%d: object %p offset 0x%llx recover from media error 0x%x kr 0x%x top_page %p result_page %p\n", __FUNCTION__, __LINE__, object, offset, error_code, kr, top_page, result_page);
5416                         assert(!resilient_media_retry); /* no double retry */
5417                         assert(resilient_media_object == VM_OBJECT_NULL);
5418                         assert(resilient_media_offset == (vm_object_offset_t)-1);
5419                         resilient_media_retry = TRUE;
5420                         resilient_media_object = object;
5421                         resilient_media_offset = offset;
5422 //                     printf("FBDP %s:%d resilient_media_object %p offset 0x%llx kept reference\n", __FUNCTION__, __LINE__, resilient_media_object, resilient_media_offset);
5423                         goto RetryFault;
5424                 } else {
5425                         /*
5426                          * we didn't succeed, lose the object reference
5427                          * immediately.
5428                          */
5429                         vm_object_deallocate(object);
5430                         object = VM_OBJECT_NULL; /* no longer valid */
5431                 }
5432
5433                 /*
5434                  * See why we failed, and take corrective action.
5435                  */
5436                 switch (kr) {
5437                 case VM_FAULT_MEMORY_SHORTAGE:
5438                         if (vm_page_wait((change_wiring) ?
5439                             THREAD_UNINT :
5440                             THREAD_ABORTSAFE)) {
5441                                 goto RetryFault;
5442                         }
5443                         OS_FALLTHROUGH;
5444                 case VM_FAULT_INTERRUPTED:
5445                         kr = KERN_ABORTED;
5446                         goto done;
5447                 case VM_FAULT_RETRY:
5448                         goto RetryFault;
5449                 case VM_FAULT_MEMORY_ERROR:
5450                         if (error_code) {
5451                                 kr = error_code;
5452                         } else {
5453                                 kr = KERN_MEMORY_ERROR;
5454                         }
5455                         goto done;
5456                 default:
5457                         panic("vm_fault: unexpected error 0x%x from "
5458                             "vm_fault_page()\n", kr);
5459                 }
5460         }
5461         m = result_page;
5462         m_object = NULL;
5463
5464         if (m != VM_PAGE_NULL) {
5465                 m_object = VM_PAGE_OBJECT(m);
5466                 assert((change_wiring && !wired) ?
5467                     (top_page == VM_PAGE_NULL) :
5468                     ((top_page == VM_PAGE_NULL) == (m_object == object)));
5469         }
5470
5471         /*
5472          * What to do with the resulting page from vm_fault_page
5473          * if it doesn't get entered into the physical map:
5474          */
5475 #define RELEASE_PAGE(m)                                 \
5476         MACRO_BEGIN                                     \
5477         PAGE_WAKEUP_DONE(m);                            \
5478         if ( !VM_PAGE_PAGEABLE(m)) {                    \
5479                 vm_page_lockspin_queues();              \
5480                 if ( !VM_PAGE_PAGEABLE(m))              \
5481                         vm_page_activate(m);            \
5482                 vm_page_unlock_queues();                \
5483         }                                               \
5484         MACRO_END
5485
5486
5487         object_locks_dropped = FALSE;
5488         /*
5489          * We must verify that the maps have not changed
5490          * since our last lookup. vm_map_verify() needs the
5491          * map lock (shared) but we are holding object locks.
5492          * So we do a try_lock() first and, if that fails, we
5493          * drop the object locks and go in for the map lock again.
5494          */
5495         if (!vm_map_try_lock_read(original_map)) {
5496                 if (m != VM_PAGE_NULL) {
5497                         old_copy_object = m_object->copy;
5498                         vm_object_unlock(m_object);
5499                 } else {
5500                         old_copy_object = VM_OBJECT_NULL;
5501                         vm_object_unlock(object);
5502                 }
5503
5504                 object_locks_dropped = TRUE;
5505
5506                 vm_map_lock_read(original_map);
5507         }
5508
5509         if ((map != original_map) || !vm_map_verify(map, &version)) {
5510                 if (object_locks_dropped == FALSE) {
5511                         if (m != VM_PAGE_NULL) {
5512                                 old_copy_object = m_object->copy;
5513                                 vm_object_unlock(m_object);
5514                         } else {
5515                                 old_copy_object = VM_OBJECT_NULL;
5516                                 vm_object_unlock(object);
5517                         }
5518
5519                         object_locks_dropped = TRUE;
5520                 }
5521
5522                 /*
5523                  * no object locks are held at this point
5524                  */
5525                 vm_object_t             retry_object;
5526                 vm_object_offset_t      retry_offset;
5527                 vm_prot_t               retry_prot;
5528
5529                 /*
5530                  * To avoid trying to write_lock the map while another
5531                  * thread has it read_locked (in vm_map_pageable), we
5532                  * do not try for write permission.  If the page is
5533                  * still writable, we will get write permission.  If it
5534                  * is not, or has been marked needs_copy, we enter the
5535                  * mapping without write permission, and will merely
5536                  * take another fault.
5537                  */
5538                 map = original_map;
5539
5540                 kr = vm_map_lookup_locked(&map, vaddr,
5541                     fault_type & ~VM_PROT_WRITE,
5542                     OBJECT_LOCK_EXCLUSIVE, &version,
5543                     &retry_object, &retry_offset, &retry_prot,
5544                     &wired,
5545                     &fault_info,
5546                     &real_map,
5547                     NULL);
5548                 pmap = real_map->pmap;
5549
5550                 if (kr != KERN_SUCCESS) {
5551                         vm_map_unlock_read(map);
5552
5553                         if (m != VM_PAGE_NULL) {
5554                                 assert(VM_PAGE_OBJECT(m) == m_object);
5555
5556                                 /*
5557                                  * retake the lock so that
5558                                  * we can drop the paging reference
5559                                  * in vm_fault_cleanup and do the
5560                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
5561                                  */
5562                                 vm_object_lock(m_object);
5563
5564                                 RELEASE_PAGE(m);
5565
5566                                 vm_fault_cleanup(m_object, top_page);
5567                         } else {
5568                                 /*
5569                                  * retake the lock so that
5570                                  * we can drop the paging reference
5571                                  * in vm_fault_cleanup
5572                                  */
5573                                 vm_object_lock(object);
5574
5575                                 vm_fault_cleanup(object, top_page);
5576                         }
5577                         vm_object_deallocate(object);
5578
5579                         goto done;
5580                 }
5581                 vm_object_unlock(retry_object);
5582
5583                 if ((retry_object != object) || (retry_offset != offset)) {
5584                         vm_map_unlock_read(map);
5585                         if (real_map != map) {
5586                                 vm_map_unlock(real_map);
5587                         }
5588
5589                         if (m != VM_PAGE_NULL) {
5590                                 assert(VM_PAGE_OBJECT(m) == m_object);
5591
5592                                 /*
5593                                  * retake the lock so that
5594                                  * we can drop the paging reference
5595                                  * in vm_fault_cleanup and do the
5596                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
5597                                  */
5598                                 vm_object_lock(m_object);
5599
5600                                 RELEASE_PAGE(m);
5601
5602                                 vm_fault_cleanup(m_object, top_page);
5603                         } else {
5604                                 /*
5605                                  * retake the lock so that
5606                                  * we can drop the paging reference
5607                                  * in vm_fault_cleanup
5608                                  */
5609                                 vm_object_lock(object);
5610
5611                                 vm_fault_cleanup(object, top_page);
5612                         }
5613                         vm_object_deallocate(object);
5614
5615                         goto RetryFault;
5616                 }
5617                 /*
5618                  * Check whether the protection has changed or the object
5619                  * has been copied while we left the map unlocked.
5620                  */
5621                 if (pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, retry_prot)) {
5622                         /* If the pmap layer cares, pass the full set. */
5623                         prot = retry_prot;
5624                 } else {
5625                         prot &= retry_prot;
5626                 }
5627         }
5628
5629         if (object_locks_dropped == TRUE) {
5630                 if (m != VM_PAGE_NULL) {
5631                         vm_object_lock(m_object);
5632
5633                         if (m_object->copy != old_copy_object) {
5634                                 /*
5635                                  * The copy object changed while the top-level object
5636                                  * was unlocked, so take away write permission.
5637                                  */
5638                                 assert(!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot));
5639                                 prot &= ~VM_PROT_WRITE;
5640                         }
5641                 } else {
5642                         vm_object_lock(object);
5643                 }
5644
5645                 object_locks_dropped = FALSE;
5646         }
5647
5648         if (!need_copy &&
5649             !fault_info.no_copy_on_read &&
5650             m != VM_PAGE_NULL &&
5651             VM_PAGE_OBJECT(m) != object &&
5652             !VM_PAGE_OBJECT(m)->pager_trusted &&
5653             vm_protect_privileged_from_untrusted &&
5654             !((prot & VM_PROT_EXECUTE) &&
5655             VM_PAGE_OBJECT(m)->code_signed &&
5656             pmap_get_vm_map_cs_enforced(caller_pmap ? caller_pmap : pmap)) &&
5657             current_proc_is_privileged()) {
5658                 /*
5659                  * We found the page we want in an "untrusted" VM object
5660                  * down the shadow chain.  Since the target is "privileged"
5661                  * we want to perform a copy-on-read of that page, so that the
5662                  * mapped object gets a stable copy and does not have to
5663                  * rely on the "untrusted" object to provide the same
5664                  * contents if the page gets reclaimed and has to be paged
5665                  * in again later on.
5666                  *
5667                  * Special case: if the mapping is executable and the untrusted
5668                  * object is code-signed and the process is "cs_enforced", we
5669                  * do not copy-on-read because that would break code-signing
5670                  * enforcement expectations (an executable page must belong
5671                  * to a code-signed object) and we can rely on code-signing
5672                  * to re-validate the page if it gets evicted and paged back in.
5673                  */
5674 //              printf("COPY-ON-READ %s:%d map %p vaddr 0x%llx obj %p offset 0x%llx found page %p (obj %p offset 0x%llx) UNTRUSTED -> need copy-on-read\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, object, offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
5675                 vm_copied_on_read++;
5676                 need_copy_on_read = TRUE;
5677                 need_copy = TRUE;
5678         } else {
5679                 need_copy_on_read = FALSE;
5680         }
5681
5682         /*
5683          * If we want to wire down this page, but no longer have
5684          * adequate permissions, we must start all over.
5685          * If we decided to copy-on-read, we must also start all over.
5686          */
5687         if ((wired && (fault_type != (prot | VM_PROT_WRITE))) ||
5688             need_copy_on_read) {
5689                 vm_map_unlock_read(map);
5690                 if (real_map != map) {
5691                         vm_map_unlock(real_map);
5692                 }
5693
5694                 if (m != VM_PAGE_NULL) {
5695                         assert(VM_PAGE_OBJECT(m) == m_object);
5696
5697                         RELEASE_PAGE(m);
5698
5699                         vm_fault_cleanup(m_object, top_page);
5700                 } else {
5701                         vm_fault_cleanup(object, top_page);
5702                 }
5703
5704                 vm_object_deallocate(object);
5705
5706                 goto RetryFault;
5707         }
5708         if (m != VM_PAGE_NULL) {
5709                 /*
5710                  * Put this page into the physical map.
5711                  * We had to do the unlock above because pmap_enter
5712                  * may cause other faults.  The page may be on
5713                  * the pageout queues.  If the pageout daemon comes
5714                  * across the page, it will remove it from the queues.
5715                  */
5716                 if (fault_page_size < PAGE_SIZE) {
5717                         DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx pa 0x%llx(0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
5718                         assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
5719                             fault_phys_offset < PAGE_SIZE),
5720                             "0x%llx\n", (uint64_t)fault_phys_offset);
5721                 } else {
5722                         assertf(fault_phys_offset == 0,
5723                             "0x%llx\n", (uint64_t)fault_phys_offset);
5724                 }
5725                 if (caller_pmap) {
5726                         kr = vm_fault_enter(m,
5727                             caller_pmap,
5728                             caller_pmap_addr,
5729                             fault_page_size,
5730                             fault_phys_offset,
5731                             prot,
5732                             caller_prot,
5733                             wired,
5734                             change_wiring,
5735                             wire_tag,
5736                             &fault_info,
5737                             NULL,
5738                             &type_of_fault);
5739                 } else {
5740                         kr = vm_fault_enter(m,
5741                             pmap,
5742                             vaddr,
5743                             fault_page_size,
5744                             fault_phys_offset,
5745                             prot,
5746                             caller_prot,
5747                             wired,
5748                             change_wiring,
5749                             wire_tag,
5750                             &fault_info,
5751                             NULL,
5752                             &type_of_fault);
5753                 }
5754                 assert(VM_PAGE_OBJECT(m) == m_object);
5755
5756                 {
5757                         int     event_code = 0;
5758
5759                         if (m_object->internal) {
5760                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
5761                         } else if (m_object->object_is_shared_cache) {
5762                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
5763                         } else {
5764                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
5765                         }
5766
5767                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | vm_fault_type_for_tracing(need_copy_on_read, type_of_fault), m->vmp_offset, get_current_unique_pid(), 0);
5768                         KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_SLOW), get_current_unique_pid(), 0, 0, 0, 0);
5769
5770                         DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
5771                 }
5772                 if (kr != KERN_SUCCESS) {
5773                         /* abort this page fault */
5774                         vm_map_unlock_read(map);
5775                         if (real_map != map) {
5776                                 vm_map_unlock(real_map);
5777                         }
5778                         PAGE_WAKEUP_DONE(m);
5779                         vm_fault_cleanup(m_object, top_page);
5780                         vm_object_deallocate(object);
5781                         goto done;
5782                 }
5783                 if (physpage_p != NULL) {
5784                         /* for vm_map_wire_and_extract() */
5785                         *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
5786                         if (prot & VM_PROT_WRITE) {
5787                                 vm_object_lock_assert_exclusive(m_object);
5788                                 m->vmp_dirty = TRUE;
5789                         }
5790                 }
5791         } else {
5792                 vm_map_entry_t          entry;
5793                 vm_map_offset_t         laddr;
5794                 vm_map_offset_t         ldelta, hdelta;
5795
5796                 /*
5797                  * do a pmap block mapping from the physical address
5798                  * in the object
5799                  */
5800
5801                 if (real_map != map) {
5802                         vm_map_unlock(real_map);
5803                 }
5804
5805                 if (original_map != map) {
5806                         vm_map_unlock_read(map);
5807                         vm_map_lock_read(original_map);
5808                         map = original_map;
5809                 }
5810                 real_map = map;
5811
5812                 laddr = vaddr;
5813                 hdelta = 0xFFFFF000;
5814                 ldelta = 0xFFFFF000;
5815
5816                 while (vm_map_lookup_entry(map, laddr, &entry)) {
5817                         if (ldelta > (laddr - entry->vme_start)) {
5818                                 ldelta = laddr - entry->vme_start;
5819                         }
5820                         if (hdelta > (entry->vme_end - laddr)) {
5821                                 hdelta = entry->vme_end - laddr;
5822                         }
5823                         if (entry->is_sub_map) {
5824                                 laddr = ((laddr - entry->vme_start)
5825                                     + VME_OFFSET(entry));
5826                                 vm_map_lock_read(VME_SUBMAP(entry));
5827
5828                                 if (map != real_map) {
5829                                         vm_map_unlock_read(map);
5830                                 }
5831                                 if (entry->use_pmap) {
5832                                         vm_map_unlock_read(real_map);
5833                                         real_map = VME_SUBMAP(entry);
5834                                 }
5835                                 map = VME_SUBMAP(entry);
5836                         } else {
5837                                 break;
5838                         }
5839                 }
5840
5841                 if (vm_map_lookup_entry(map, laddr, &entry) &&
5842                     (VME_OBJECT(entry) != NULL) &&
5843                     (VME_OBJECT(entry) == object)) {
5844                         uint16_t superpage;
5845
5846                         if (!object->pager_created &&
5847                             object->phys_contiguous &&
5848                             VME_OFFSET(entry) == 0 &&
5849                             (entry->vme_end - entry->vme_start == object->vo_size) &&
5850                             VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size - 1))) {
5851                                 superpage = VM_MEM_SUPERPAGE;
5852                         } else {
5853                                 superpage = 0;
5854                         }
5855
5856                         if (superpage && physpage_p) {
5857                                 /* for vm_map_wire_and_extract() */
5858                                 *physpage_p = (ppnum_t)
5859                                     ((((vm_map_offset_t)
5860                                     object->vo_shadow_offset)
5861                                     + VME_OFFSET(entry)
5862                                     + (laddr - entry->vme_start))
5863                                     >> PAGE_SHIFT);
5864                         }
5865
5866                         if (caller_pmap) {
5867                                 /*
5868                                  * Set up a block mapped area
5869                                  */
5870                                 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
5871                                 kr = pmap_map_block(caller_pmap,
5872                                     (addr64_t)(caller_pmap_addr - ldelta),
5873                                     (ppnum_t)((((vm_map_offset_t) (VME_OBJECT(entry)->vo_shadow_offset)) +
5874                                     VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
5875                                     (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
5876                                     (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
5877
5878                                 if (kr != KERN_SUCCESS) {
5879                                         goto cleanup;
5880                                 }
5881                         } else {
5882                                 /*
5883                                  * Set up a block mapped area
5884                                  */
5885                                 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
5886                                 kr = pmap_map_block(real_map->pmap,
5887                                     (addr64_t)(vaddr - ldelta),
5888                                     (ppnum_t)((((vm_map_offset_t)(VME_OBJECT(entry)->vo_shadow_offset)) +
5889                                     VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
5890                                     (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
5891                                     (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
5892
5893                                 if (kr != KERN_SUCCESS) {
5894                                         goto cleanup;
5895                                 }
5896                         }
5897                 }
5898         }
5899
5900         /*
5901          * Success
5902          */
5903         kr = KERN_SUCCESS;
5904
5905         /*
5906          * TODO: could most of the done cases just use cleanup?
5907          */
5908 cleanup:
5909         /*
5910          * Unlock everything, and return
5911          */
5912         vm_map_unlock_read(map);
5913         if (real_map != map) {
5914                 vm_map_unlock(real_map);
5915         }
5916
5917         if (m != VM_PAGE_NULL) {
5918                 assert(VM_PAGE_OBJECT(m) == m_object);
5919
5920                 if (!m_object->internal && (fault_type & VM_PROT_WRITE)) {
5921                         vm_object_paging_begin(m_object);
5922
5923                         assert(written_on_object == VM_OBJECT_NULL);
5924                         written_on_object = m_object;
5925                         written_on_pager = m_object->pager;
5926                         written_on_offset = m_object->paging_offset + m->vmp_offset;
5927                 }
5928                 PAGE_WAKEUP_DONE(m);
5929
5930                 vm_fault_cleanup(m_object, top_page);
5931         } else {
5932                 vm_fault_cleanup(object, top_page);
5933         }
5934
5935         vm_object_deallocate(object);
5936
5937 #undef  RELEASE_PAGE
5938
5939 done:
5940         thread_interrupt_level(interruptible_state);
5941
5942         if (resilient_media_object != VM_OBJECT_NULL) {
5943                 assert(resilient_media_retry);
5944                 assert(resilient_media_offset != (vm_object_offset_t)-1);
5945                 /* release extra reference on failed object */
5946 //             printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
5947                 vm_object_deallocate(resilient_media_object);
5948                 resilient_media_object = VM_OBJECT_NULL;
5949                 resilient_media_offset = (vm_object_offset_t)-1;
5950                 resilient_media_retry = FALSE;
5951         }
5952         assert(!resilient_media_retry);
5953
5954         /*
5955          * Only I/O throttle on faults which cause a pagein/swapin.
5956          */
5957         if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
5958                 throttle_lowpri_io(1);
5959         } else {
5960                 if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
5961                         if ((throttle_delay = vm_page_throttled(TRUE))) {
5962                                 if (vm_debug_events) {
5963                                         if (type_of_fault == DBG_COMPRESSOR_FAULT) {
5964                                                 VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5965                                         } else if (type_of_fault == DBG_COW_FAULT) {
5966                                                 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5967                                         } else {
5968                                                 VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5969                                         }
5970                                 }
5971                                 delay(throttle_delay);
5972                         }
5973                 }
5974         }
5975
5976         if (written_on_object) {
5977                 vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64);
5978
5979                 vm_object_lock(written_on_object);
5980                 vm_object_paging_end(written_on_object);
5981                 vm_object_unlock(written_on_object);
5982
5983                 written_on_object = VM_OBJECT_NULL;
5984         }
5985
5986         if (rtfault) {
5987                 vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault);
5988         }
5989
5990         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
5991             (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
5992             ((uint64_t)trace_vaddr >> 32),
5993             trace_vaddr,
5994             kr,
5995             vm_fault_type_for_tracing(need_copy_on_read, type_of_fault),
5996             0);
5997
5998         if (fault_page_size < PAGE_SIZE && kr != KERN_SUCCESS) {
5999                 DEBUG4K_FAULT("map %p original %p vaddr 0x%llx -> 0x%x\n", map, original_map, (uint64_t)trace_real_vaddr, kr);
6000         }
6001
6002         return kr;
6003 }
6004
6005 /*
6006  *      vm_fault_wire:
6007  *
6008  *      Wire down a range of virtual addresses in a map.
      *
      *      The range is [entry->vme_start, entry->vme_end).  Each page is
      *      wired by simulating a fault on it: vm_fault_wire_fast() is tried
      *      first and vm_fault_internal() is the fallback when it fails.
      *
      *      map         map containing "entry"; its page size (capped at
      *                  PAGE_SIZE) is the stride between simulated faults
      *      entry       map entry to wire; asserted to be "in_transition"
      *      prot        protection handed to the simulated faults
      *      wire_tag    VM tag handed to the simulated faults
      *      pmap        physical map the pages are entered into
      *      pmap_addr   address in "pmap" corresponding to entry->vme_start
      *      physpage_p  forwarded unchanged to both fault routines
      *                  (NOTE(review): presumably an optional out-parameter
      *                  receiving the physical page number -- confirm
      *                  against the callers)
      *
      *      Returns KERN_SUCCESS when every page was wired, or immediately
      *      when the entry maps a phys_contiguous object (nothing is done in
      *      that case).  On the first failing page, the pages wired so far
      *      are unwired again and that fault's error code is returned.
6009  */
6010 kern_return_t
6011 vm_fault_wire(
6012         vm_map_t        map,
6013         vm_map_entry_t  entry,
6014         vm_prot_t       prot,
6015         vm_tag_t        wire_tag,
6016         pmap_t          pmap,
6017         vm_map_offset_t pmap_addr,
6018         ppnum_t         *physpage_p)
6019 {
6020         vm_map_offset_t va;
6021         vm_map_offset_t end_addr = entry->vme_end;
6022         kern_return_t   rc;
6023         vm_map_size_t   effective_page_size;
6024
6025         assert(entry->in_transition);
6026
             /*
              * A non-submap entry backed by a phys_contiguous object is
              * reported as wired without doing any work; vm_fault_unwire()
              * skips the same entries symmetrically.
              */
6027         if ((VME_OBJECT(entry) != NULL) &&
6028             !entry->is_sub_map &&
6029             VME_OBJECT(entry)->phys_contiguous) {
6030                 return KERN_SUCCESS;
6031         }
6032
6033         /*
6034          *      Inform the physical mapping system that the
6035          *      range of addresses may not fault, so that
6036          *      page tables and such can be locked down as well.
6037          */
6038
6039         pmap_pageable(pmap, pmap_addr,
6040             pmap_addr + (end_addr - entry->vme_start), FALSE);
6041
6042         /*
6043          *      We simulate a fault to get the page and enter it
6044          *      in the physical map.
6045          */
6046
             /* never step by more than the kernel's native PAGE_SIZE */
6047         effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
6048         for (va = entry->vme_start;
6049             va < end_addr;
6050             va += effective_page_size) {
                     /* "va" is map-relative; translate it into "pmap" space */
6051                 rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap,
6052                     pmap_addr + (va - entry->vme_start),
6053                     physpage_p);
6054                 if (rc != KERN_SUCCESS) {
                             /*
                              * Fast path declined: take the general fault
                              * path (the TRUE argument is presumably
                              * "change_wiring" -- confirm against the
                              * vm_fault_internal() prototype).  Faults on
                              * behalf of the kernel pmap are uninterruptible;
                              * all others are THREAD_ABORTSAFE.
                              */
6055                         rc = vm_fault_internal(map, va, prot, TRUE, wire_tag,
6056                             ((pmap == kernel_pmap)
6057                             ? THREAD_UNINT
6058                             : THREAD_ABORTSAFE),
6059                             pmap,
6060                             (pmap_addr +
6061                             (va - entry->vme_start)),
6062                             physpage_p);
6063                         DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
6064                 }
6065
6066                 if (rc != KERN_SUCCESS) {
                             /*
                              * Roll back: work on a stack copy of the entry
                              * truncated at the failing address, so only
                              * [vme_start, va) is unwired and the caller's
                              * entry is left unmodified.
                              *
                              * NOTE(review): vm_fault_unwire() derives its
                              * pmap_pageable(TRUE) range from the truncated
                              * copy, so [va, end_addr) stays marked
                              * non-pageable from the call above -- confirm
                              * this is intended.
                              */
6067                         struct vm_map_entry     tmp_entry = *entry;
6068
6069                         /* unwire wired pages */
6070                         tmp_entry.vme_end = va;
6071                         vm_fault_unwire(map,
6072                             &tmp_entry, FALSE, pmap, pmap_addr);
6073
6074                         return rc;
6075                 }
6076         }
6077         return KERN_SUCCESS;
6078 }
6079
6080 /*
6081  *      vm_fault_unwire:
6082  *
6083  *      Unwire a range of virtual addresses in a map.
      *
      *      Walks [entry->vme_start, entry->vme_end) one page at a time
      *      (stride: the map's page size, capped at PAGE_SIZE) and undoes
      *      the wiring of each page.
      *
      *      map         map containing "entry"
      *      entry       map entry whose range is unwired
      *      deallocate  if TRUE, each page found is disconnected from the
      *                  pmap layer and freed instead of merely unwired
      *      pmap        physical map whose wiring is changed; tested for
      *                  NULL inside the loop
      *      pmap_addr   address in "pmap" corresponding to entry->vme_start
      *
      *      Returns nothing.  Panics if vm_fault_page() fails for any reason
      *      other than the two VM_FAULT_MEMORY_ERROR cases tolerated in the
      *      loop body.
6084  */
6085 void
6086 vm_fault_unwire(
6087         vm_map_t        map,
6088         vm_map_entry_t  entry,
6089         boolean_t       deallocate,
6090         pmap_t          pmap,
6091         vm_map_offset_t pmap_addr)
6092 {
6093         vm_map_offset_t va;
6094         vm_map_offset_t end_addr = entry->vme_end;
6095         vm_object_t             object;
6096         struct vm_object_fault_info fault_info = {};
6097         unsigned int    unwired_pages;
6098         vm_map_size_t   effective_page_size;
6099
             /* submap entries are handled through the "no object" path below */
6100         object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);
6101
6102         /*
6103          * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
6104          * do anything since such memory is wired by default.  So we don't have
6105          * anything to undo here.
6106          */
6107
6108         if (object != VM_OBJECT_NULL && object->phys_contiguous) {
6109                 return;
6110         }
6111
             /*
              * Fault parameters shared by every vm_fault_page() call below,
              * copied from the map entry.  PMAP_OPTIONS_ALT_ACCT is requested
              * for iokit_acct entries and for non-submap entries that do not
              * have use_pmap set.
              */
6112         fault_info.interruptible = THREAD_UNINT;
6113         fault_info.behavior = entry->behavior;
6114         fault_info.user_tag = VME_ALIAS(entry);
6115         if (entry->iokit_acct ||
6116             (!entry->is_sub_map && !entry->use_pmap)) {
6117                 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
6118         }
6119         fault_info.lo_offset = VME_OFFSET(entry);
6120         fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
6121         fault_info.no_cache = entry->no_cache;
6122         fault_info.stealth = TRUE;
6123
             /* counts pages found wired; only consumed for kernel_object below */
6124         unwired_pages = 0;
6125
6126         /*
6127          *      Since the pages are wired down, we must be able to
6128          *      get their mappings from the physical map system.
6129          */
6130
6131         effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
6132         for (va = entry->vme_start;
6133             va < end_addr;
6134             va += effective_page_size) {
6135                 if (object == VM_OBJECT_NULL) {
                             /*
                              * No object to look the page up in (submap entry
                              * or entry without an object): clear the wiring
                              * in the pmap, then run a VM_PROT_NONE fault on
                              * the address (the TRUE argument is presumably
                              * "change_wiring" -- confirm against the
                              * vm_fault() prototype).
                              *
                              * NOTE(review): vm_fault() is handed the
                              * un-offset "pmap_addr", whereas the
                              * pmap_change_wiring() call just above uses
                              * pmap_addr + (va - entry->vme_start) -- confirm
                              * this is intended.
                              */
6136                         if (pmap) {
6137                                 pmap_change_wiring(pmap,
6138                                     pmap_addr + (va - entry->vme_start), FALSE);
6139                         }
6140                         (void) vm_fault(map, va, VM_PROT_NONE,
6141                             TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr);
6142                 } else {
6143                         vm_prot_t       prot;
6144                         vm_page_t       result_page;
6145                         vm_page_t       top_page;
6146                         vm_object_t     result_object;
6147                         vm_fault_return_t result;
6148
6149                         /*
                              * The cluster size is the number of bytes left in
                              * the range.  If os_sub_overflow() reports that
                              * the difference does not fit in upl_size_t, it
                              * is clamped to "0 - PAGE_SIZE" (assuming
                              * upl_size_t is unsigned, that is the largest
                              * multiple of PAGE_SIZE it can hold -- TODO
                              * confirm the typedef).
                              */
6150                         upl_size_t cluster_size;
6151                         if (os_sub_overflow(end_addr, va, &cluster_size)) {
6152                                 cluster_size = 0 - (upl_size_t)PAGE_SIZE;
6153                         }
6154                         fault_info.cluster_size = cluster_size;
6155
                             /*
                              * Look the page up at the entry's offset for "va".
                              * Every attempt re-takes the object lock and a
                              * paging reference and resets the outputs before
                              * calling vm_fault_page(); the loop repeats for as
                              * long as it answers VM_FAULT_RETRY.
                              */
6156                         do {
6157                                 prot = VM_PROT_NONE;
6158
6159                                 vm_object_lock(object);
6160                                 vm_object_paging_begin(object);
6161                                 result_page = VM_PAGE_NULL;
6162                                 result = vm_fault_page(
6163                                         object,
6164                                         (VME_OFFSET(entry) +
6165                                         (va - entry->vme_start)),
6166                                         VM_PROT_NONE, TRUE,
6167                                         FALSE, /* page not looked up */
6168                                         &prot, &result_page, &top_page,
6169                                         (int *)0,
6170                                         NULL, map->no_zero_fill,
6171                                         FALSE, &fault_info);
6172                         } while (result == VM_FAULT_RETRY);
6173
6174                         /*
6175                          * If this was a mapping to a file on a device that has been forcibly
6176                          * unmounted, then we won't get a page back from vm_fault_page().  Just
6177                          * move on to the next one in case the remaining pages are mapped from
6178                          * different objects.  During a forced unmount, the object is terminated
6179                          * so the alive flag will be false if this happens.  A forced unmount will
6180                          * occur when an external disk is unplugged before the user does an
6181                          * eject, so we don't want to panic in that situation.
6182                          */
6183
6184                         if (result == VM_FAULT_MEMORY_ERROR && !object->alive) {
6185                                 continue;
6186                         }
6187
6188                         if (result == VM_FAULT_MEMORY_ERROR &&
6189                             object == kernel_object) {
6190                                 /*
6191                                  * This must have been allocated with
6192                                  * KMA_KOBJECT and KMA_VAONLY and there's
6193                                  * no physical page at this offset.
6194                                  * We're done (no page to free).
6195                                  */
6196                                 assert(deallocate);
6197                                 continue;
6198                         }
6199
                             /* any other outcome than success is fatal */
6200                         if (result != VM_FAULT_SUCCESS) {
6201                                 panic("vm_fault_unwire: failure");
6202                         }
6203
                             /*
                              * The page may belong to a different object than
                              * the one we started from, so remember its owner
                              * for the vm_fault_cleanup() call at the bottom.
                              */
6204                         result_object = VM_PAGE_OBJECT(result_page);
6205
6206                         if (deallocate) {
                                     /*
                                      * Caller wants the memory gone: disconnect
                                      * the physical page from the pmap layer and
                                      * free it.  A page that was wired is still
                                      * counted for the tag accounting below.
                                      */
6207                                 assert(VM_PAGE_GET_PHYS_PAGE(result_page) !=
6208                                     vm_page_fictitious_addr);
6209                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page));
6210                                 if (VM_PAGE_WIRED(result_page)) {
6211                                         unwired_pages++;
6212                                 }
6213                                 VM_PAGE_FREE(result_page);
6214                         } else {
                                     /*
                                      * Plain unwire.  The pmap wiring is only
                                      * cleared when a pmap was supplied and the
                                      * page is not the guard page.
                                      */
6215                                 if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr)) {
6216                                         pmap_change_wiring(pmap,
6217                                             pmap_addr + (va - entry->vme_start), FALSE);
6218                                 }
6219
6220
                                     /* vm_page_unwire() is called with the page queues locked */
6221                                 if (VM_PAGE_WIRED(result_page)) {
6222                                         vm_page_lockspin_queues();
6223                                         vm_page_unwire(result_page, TRUE);
6224                                         vm_page_unlock_queues();
6225                                         unwired_pages++;
6226                                 }
                                     /*
                                      * NOTE(review): the flag is cleared right
                                      * after the first page is zeroed, so later
                                      * pages of this entry are not zeroed by
                                      * this loop -- confirm this is intended.
                                      */
6227                                 if (entry->zero_wired_pages) {
6228                                         pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page));
6229                                         entry->zero_wired_pages = FALSE;
6230                                 }
6231
6232                                 PAGE_WAKEUP_DONE(result_page);
6233                         }
                             /* runs on both the free and the unwire path */
6234                         vm_fault_cleanup(result_object, top_page);
6235                 }
6236         }
6237
6238         /*
6239          *      Inform the physical mapping system that the range
6240          *      of addresses may fault, so that page tables and
6241          *      such may be unwired themselves.
6242          */
6243
             /*
              * NOTE(review): unlike the pmap_change_wiring() calls in the
              * loop, this call is not guarded by a NULL check on "pmap" --
              * confirm pmap_pageable() tolerates a NULL pmap.
              */
6244         pmap_pageable(pmap, pmap_addr,
6245             pmap_addr + (end_addr - entry->vme_start), TRUE);
6246
             /*
              * For kernel_object memory only, charge the entry's VM tag with
              * a negative delta derived from the number of pages that were
              * found wired.
              */
6247         if (kernel_object == object) {
6248                 /*
6249                  * Would like to make user_tag in vm_object_fault_info
6250                  * vm_tag_t (unsigned short) but user_tag derives its value from
6251                  * VME_ALIAS(entry) at a few places and VME_ALIAS, in turn, casts
6252                  * to an _unsigned int_ which is used by non-fault_info paths throughout the
6253                  * code at many places.
6254                  *
6255                  * So, for now, an explicit truncation to unsigned short (vm_tag_t).
6256                  */
6257                 assertf((fault_info.user_tag & VME_ALIAS_MASK) == fault_info.user_tag,
6258                     "VM Tag truncated from 0x%x to 0x%x\n", fault_info.user_tag, (fault_info.user_tag & VME_ALIAS_MASK));
6259                 vm_tag_update_size((vm_tag_t) fault_info.user_tag, -ptoa_64(unwired_pages));
6260         }
6261 }
6262
6263 /*
6264  *      vm_fault_wire_fast:
6265  *
6266  *      Handle the common case of a wire-down page fault at the given address:
6267  *      the page is already resident in the entry's top-level object and is
6268  *      neither busy nor in an error/restart/absent state.  On success the
6269  *      page is wired and entered into the supplied pmap at pmap_addr.
6270  *      The map entry is passed in to avoid the overhead of a map lookup.
6271  *
6272  *      NOTE: the given address should be truncated to the proper page address.
6273  *
6274  *      Returns KERN_SUCCESS if the fault was handled here.  Every bail-out
6275  *      (submap entry, page not usable, copy object on a writable entry,
6276  *      vm_fault_enter() failure) returns KERN_FAILURE, and the caller is
6277  *      expected to fall back to vm_fault().  If physpage_p is non-NULL it
6278  *      receives the page's physical page number on success; the KERN_FAILURE
6279  *      bail-outs return without writing it.
6280  *
6281  *      The map must be referenced (and remains so); caller has a read lock on it.
6282  */
6283 static kern_return_t
6284 vm_fault_wire_fast(
6285         __unused vm_map_t       map,            /* still passed to VM_MAP_PAGE_SIZE() below despite __unused */
6286         vm_map_offset_t va,
6287         __unused vm_prot_t       caller_prot,
6288         vm_tag_t        wire_tag,               /* handed to vm_page_wire() and vm_fault_enter() */
6289         vm_map_entry_t  entry,
6290         pmap_t          pmap,
6291         vm_map_offset_t pmap_addr,
6292         ppnum_t         *physpage_p)            /* optional out parameter; may be NULL */
6293 {
6294         vm_object_t             object;
6295         vm_object_offset_t      offset;
6296         vm_page_t               m;
6297         vm_prot_t               prot;
6298         thread_t                thread = current_thread();
6299         int                     type_of_fault;
6300         kern_return_t           kr;
6301         vm_map_size_t           fault_page_size;
6302         vm_map_offset_t         fault_phys_offset;
6303         struct vm_object_fault_info fault_info = {};
6304
6305         VM_STAT_INCR(faults);
6306
6307         if (thread != THREAD_NULL && thread->task != TASK_NULL) {
6308                 thread->task->faults++;         /* per-task fault counter */
6309         }
6310
6311 /*
6312  *      Recovery actions: macros used by the bail-out paths below.  They expand in place and name the local `object`.
6313  */
6314 /* RELEASE_PAGE(m): run PAGE_WAKEUP_DONE on m, then unwire it under the page-queues lock. */
6315 #undef  RELEASE_PAGE
6316 #define RELEASE_PAGE(m) {                               \
6317         PAGE_WAKEUP_DONE(m);                            \
6318         vm_page_lockspin_queues();                      \
6319         vm_page_unwire(m, TRUE);                        \
6320         vm_page_unlock_queues();                        \
6321 }
6322
6323 /* UNLOCK_THINGS: vm_object_paging_end() on `object`, then unlock it. */
6324 #undef  UNLOCK_THINGS
6325 #define UNLOCK_THINGS   {                               \
6326         vm_object_paging_end(object);                      \
6327         vm_object_unlock(object);                          \
6328 }
6329 /* UNLOCK_AND_DEALLOCATE: UNLOCK_THINGS, then vm_object_deallocate(object). */
6330 #undef  UNLOCK_AND_DEALLOCATE
6331 #define UNLOCK_AND_DEALLOCATE   {                       \
6332         UNLOCK_THINGS;                                  \
6333         vm_object_deallocate(object);                   \
6334 }
6335 /*
6336  *      Give up and have caller do things the hard way: UNLOCK_AND_DEALLOCATE, then return KERN_FAILURE from this function.
6337  */
6338
6339 #define GIVE_UP {                                       \
6340         UNLOCK_AND_DEALLOCATE;                          \
6341         return(KERN_FAILURE);                           \
6342 }
6343
6344
6345         /*
6346          *      If this entry is not directly to a vm_object, bail out.
6347          */
6348         if (entry->is_sub_map) {
6349                 assert(physpage_p == NULL);
6350                 return KERN_FAILURE;            /* plain return: nothing has been locked or referenced yet */
6351         }
6352
6353         /*
6354          *      Find the backing store object and offset into it.
6355          */
6356
6357         object = VME_OBJECT(entry);
6358         offset = (va - entry->vme_start) + VME_OFFSET(entry);
6359         prot = entry->protection;
6360
6361         /*
6362          *      Make a reference to this object to prevent its
6363          *      disposal while we are messing with it.
6364          */
6365
6366         vm_object_lock(object);
6367         vm_object_reference_locked(object);
6368         vm_object_paging_begin(object);
6369
6370         /*
6371          *      INVARIANTS (through entire routine):
6372          *
6373          *      1)      At all times, we must either have the object
6374          *              lock or a busy page in some object to prevent
6375          *              some other thread from trying to bring in
6376          *              the same page.
6377          *
6378          *      2)      Once we have a busy page, we must remove it from
6379          *              the pageout queues, so that the pageout daemon
6380          *              will not grab it away.
6381          *
6382          */
6383
6384         /*
6385          *      Look for page in top-level object.  If it's not there or
6386          *      there's something going on, give up.
6387          */
6388         m = vm_page_lookup(object, vm_object_trunc_page(offset));
6389         if ((m == VM_PAGE_NULL) || (m->vmp_busy) ||
6390             (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_absent))) {
6391                 GIVE_UP;
6392         }
6393         if (m->vmp_fictitious &&
6394             VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
6395                 /*
6396                  * Guard pages are fictitious pages and are never entered into a pmap, so let's say it's been wired...
6397                  * The page is neither wired nor marked busy on this path; go straight to the common exit.
6398                  */
6399                 kr = KERN_SUCCESS;
6400                 goto done;
6401         }
6402
6403         /*
6404          *      Wire the page down now.  All bail outs beyond this
6405          *      point must unwire the page.
6406          */
6407
6408         vm_page_lockspin_queues();
6409         vm_page_wire(m, wire_tag, TRUE);
6410         vm_page_unlock_queues();
6411
6412         /*
6413          *      Mark page busy for other threads.
6414          */
6415         assert(!m->vmp_busy);
6416         m->vmp_busy = TRUE;
6417         assert(!m->vmp_absent);
6418
6419         /*
6420          *      Give up if the page is being written and there's a copy object
6421          */
6422         if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
6423                 RELEASE_PAGE(m);
6424                 GIVE_UP;
6425         }
6426         /* Describe the mapping to vm_fault_enter(): tag from the entry alias, PMAP_OPTIONS_ALT_ACCT for iokit_acct or !use_pmap entries. */
6427         fault_info.user_tag = VME_ALIAS(entry);
6428         fault_info.pmap_options = 0;
6429         if (entry->iokit_acct ||
6430             (!entry->is_sub_map && !entry->use_pmap)) {
6431                 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
6432         }
6433         /* Fault size is the smaller of the map page size and PAGE_SIZE; the phys offset is offset's position within its object page. */
6434         fault_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
6435         fault_phys_offset = offset - vm_object_trunc_page(offset);
6436
6437         /*
6438          *      Put this page into the physical map.
6439          */
6440         type_of_fault = DBG_CACHE_HIT_FAULT;
6441         kr = vm_fault_enter(m,
6442             pmap,
6443             pmap_addr,
6444             fault_page_size,
6445             fault_phys_offset,
6446             prot,
6447             prot,
6448             TRUE,                  /* wired */
6449             FALSE,                 /* change_wiring */
6450             wire_tag,
6451             &fault_info,
6452             NULL,
6453             &type_of_fault);
6454         if (kr != KERN_SUCCESS) {
6455                 RELEASE_PAGE(m);
6456                 GIVE_UP;                /* note: returns KERN_FAILURE, not the vm_fault_enter() status in kr */
6457         }
6458
6459 done:
6460         /*
6461          *      Unlock everything, and return.  Both paths that reach here have kr == KERN_SUCCESS.
6462          */
6463
6464         if (physpage_p) {
6465                 /* for vm_map_wire_and_extract() */
6466                 if (kr == KERN_SUCCESS) {
6467                         assert(object == VM_PAGE_OBJECT(m));
6468                         *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6469                         if (prot & VM_PROT_WRITE) {
6470                                 vm_object_lock_assert_exclusive(object);
6471                                 m->vmp_dirty = TRUE;
6472                         }
6473                 } else {
6474                         *physpage_p = 0;
6475                 }
6476         }
6477         /* PAGE_WAKEUP_DONE also runs for the guard-page path, where this routine never set vmp_busy. */
6478         PAGE_WAKEUP_DONE(m);
6479         UNLOCK_AND_DEALLOCATE;
6480
6481         return kr;
6482 }
6483
6484 /*
6485  *      Routine:        vm_fault_copy_cleanup
6486  *      Purpose:
6487  *              Let go of a page vm_fault_copy is finished with, then run vm_fault_cleanup() on its object and top_page.
6488  */
6489
6490 static void
6491 vm_fault_copy_cleanup(
6492         vm_page_t       page,
6493         vm_page_t       top_page)
6494 {
6495         vm_object_t     owner;
6496         owner = VM_PAGE_OBJECT(page);
6497         vm_object_lock(owner);
6498         PAGE_WAKEUP_DONE(page);
6499         if (!VM_PAGE_PAGEABLE(page)) {                  /* first test, made without the queues lock */
6500                 vm_page_lockspin_queues();
6501                 if (!VM_PAGE_PAGEABLE(page)) {          /* same test repeated with the queues lock held */
6502                         vm_page_activate(page);
6503                 }
6504                 vm_page_unlock_queues();
6505         }
6506         vm_fault_cleanup(owner, top_page);
6507 }
6508 /* Unwire a destination page and end the paging reference on its object; a VM_PAGE_NULL page is a no-op. */
6509 static void
6510 vm_fault_copy_dst_cleanup(
6511         vm_page_t       page)
6512 {
6513         vm_object_t     dst_page_object;
6514         if (page == VM_PAGE_NULL) {
6515                 return;
6516         }
6517         dst_page_object = VM_PAGE_OBJECT(page);
6518         vm_object_lock(dst_page_object);
6519         vm_page_lockspin_queues();
6520         vm_page_unwire(page, TRUE);
6521         vm_page_unlock_queues();
6522         vm_object_paging_end(dst_page_object);
6523         vm_object_unlock(dst_page_object);
6524 }
6525
6526 /*
6527  *      Routine:        vm_fault_copy
6528  *
6529  *      Purpose:
6530  *              Copy data from one virtual memory object to another -- neither the source nor
6531  *              destination pages need be resident.  A VM_OBJECT_NULL source zero-fills the destination.
6532  *
6533  *              Before actually copying a page, the version associated with
6534  *              the destination address map will be verified.
6535  *
6536  *      In/out conditions:
6537  *              The caller must hold a reference, but not a lock, to
6538  *              each of the source and destination objects and to the
6539  *              destination map.
6540  *
6541  *      Results:
6542  *              Returns KERN_SUCCESS if no errors were encountered in
6543  *              reading or writing the data.  Returns MACH_SEND_INTERRUPTED if
6544  *              the operation was interrupted (only possible if the
6545  *              "interruptible" argument is asserted).  Other return values
6546  *              indicate a permanent error in copying the data.
6547  *
6548  *              The actual amount of data copied is returned in the "copy_size" argument on the
6549  *              KERN_SUCCESS and interrupted returns.  If the destination map verification (or the
6550  *              copy-object recheck) fails, the loop stops early and this amount may be less than the
6551  *              amount requested.  The memory-error returns leave *copy_size unmodified.
6552  */
6553 kern_return_t
6554 vm_fault_copy(
6555         vm_object_t             src_object,
6556         vm_object_offset_t      src_offset,
6557         vm_map_size_t           *copy_size,             /* INOUT */
6558         vm_object_t             dst_object,
6559         vm_object_offset_t      dst_offset,
6560         vm_map_t                dst_map,
6561         vm_map_version_t         *dst_version,
6562         int                     interruptible)
6563 {
6564         vm_page_t               result_page;
6565
6566         vm_page_t               src_page;
6567         vm_page_t               src_top_page;
6568         vm_prot_t               src_prot;
6569
6570         vm_page_t               dst_page;
6571         vm_page_t               dst_top_page;
6572         vm_prot_t               dst_prot;
6573
6574         vm_map_size_t           amount_left;
6575         vm_object_t             old_copy_object;
6576         vm_object_t             result_page_object = NULL;
6577         kern_return_t           error = 0;
6578         vm_fault_return_t       result;
6579
6580         vm_map_size_t           part_size;
6581         struct vm_object_fault_info fault_info_src = {};
6582         struct vm_object_fault_info fault_info_dst = {};
6583
6584         /*
6585          * In order not to confuse the clustered pageins, align
6586          * the different offsets on a page boundary.
6587          */
6588         /* RETURN(x): report the amount actually copied through *copy_size, then return x. */
6589 #define RETURN(x)                                       \
6590         MACRO_BEGIN                                     \
6591         *copy_size -= amount_left;                      \
6592         MACRO_RETURN(x);                                \
6593         MACRO_END
6594
6595         amount_left = *copy_size;
6596
6597         fault_info_src.interruptible = interruptible;
6598         fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
6599         fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
6600         fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
6601         fault_info_src.stealth = TRUE;
6602
6603         fault_info_dst.interruptible = interruptible;
6604         fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
6605         fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
6606         fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
6607         fault_info_dst.stealth = TRUE;
6608
6609         do { /* while (amount_left > 0) */
6610                 /*
6611                  * There may be a deadlock if both source and destination
6612                  * pages are the same. To avoid this deadlock, the copy must
6613                  * start by getting the destination page in order to apply
6614                  * COW semantics if any.
6615                  */
6616
6617 RetryDestinationFault:;
6618
6619                 dst_prot = VM_PROT_WRITE | VM_PROT_READ;
6620
6621                 vm_object_lock(dst_object);
6622                 vm_object_paging_begin(dst_object);
6623
6624                 /* cap cluster size at maximum UPL size */
6625                 upl_size_t cluster_size;
6626                 if (os_convert_overflow(amount_left, &cluster_size)) {
6627                         cluster_size = 0 - (upl_size_t)PAGE_SIZE;
6628                 }
6629                 fault_info_dst.cluster_size = cluster_size;
6630
6631                 dst_page = VM_PAGE_NULL;
6632                 result = vm_fault_page(dst_object,
6633                     vm_object_trunc_page(dst_offset),
6634                     VM_PROT_WRITE | VM_PROT_READ,
6635                     FALSE,
6636                     FALSE,                    /* page not looked up */
6637                     &dst_prot, &dst_page, &dst_top_page,
6638                     (int *)0,
6639                     &error,
6640                     dst_map->no_zero_fill,
6641                     FALSE, &fault_info_dst);
6642                 switch (result) {
6643                 case VM_FAULT_SUCCESS:
6644                         break;
6645                 case VM_FAULT_RETRY:
6646                         goto RetryDestinationFault;
6647                 case VM_FAULT_MEMORY_SHORTAGE:
6648                         if (vm_page_wait(interruptible)) {
6649                                 goto RetryDestinationFault;
6650                         }
6651                         OS_FALLTHROUGH;
6652                 case VM_FAULT_INTERRUPTED:
6653                         RETURN(MACH_SEND_INTERRUPTED);
6654                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
6655                         /* success but no VM page: fail the copy */
6656                         vm_object_paging_end(dst_object);
6657                         vm_object_unlock(dst_object);
6658                         OS_FALLTHROUGH;
6659                 case VM_FAULT_MEMORY_ERROR:
6660                         if (error) {
6661                                 return error;                   /* direct return: *copy_size is not updated */
6662                         } else {
6663                                 return KERN_MEMORY_ERROR;       /* direct return: *copy_size is not updated */
6664                         }
6665                 default:
6666                         panic("vm_fault_copy: unexpected error 0x%x from "
6667                             "vm_fault_page()\n", result);
6668                 }
6669                 assert((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
6670
6671                 assert(dst_object == VM_PAGE_OBJECT(dst_page));
6672                 old_copy_object = dst_object->copy;     /* rechecked after the source fault, below */
6673
6674                 /*
6675                  * There exists the possibility that the source and
6676                  * destination page are the same.  But we can't
6677                  * easily determine that now.  If they are the
6678                  * same, the call to vm_fault_page() for the
6679                  * destination page will deadlock.  To prevent this we
6680                  * wire the page so we can drop busy without having
6681                  * the page daemon steal the page.  We clean up the
6682                  * top page  but keep the paging reference on the object
6683                  * holding the dest page so it doesn't go away.
6684                  */
6685
6686                 vm_page_lockspin_queues();
6687                 vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE);
6688                 vm_page_unlock_queues();
6689                 PAGE_WAKEUP_DONE(dst_page);
6690                 vm_object_unlock(dst_object);
6691
6692                 if (dst_top_page != VM_PAGE_NULL) {
6693                         vm_object_lock(dst_object);
6694                         VM_PAGE_FREE(dst_top_page);
6695                         vm_object_paging_end(dst_object);
6696                         vm_object_unlock(dst_object);
6697                 }
6698
6699 RetrySourceFault:;
6700
6701                 if (src_object == VM_OBJECT_NULL) {
6702                         /*
6703                          *      No source object.  We will just
6704                          *      zero-fill the page in dst_object.
6705                          */
6706                         src_page = VM_PAGE_NULL;
6707                         result_page = VM_PAGE_NULL;
6708                 } else {
6709                         vm_object_lock(src_object);
6710                         src_page = vm_page_lookup(src_object,
6711                             vm_object_trunc_page(src_offset));
6712                         if (src_page == dst_page) {
6713                                 src_prot = dst_prot;
6714                                 result_page = VM_PAGE_NULL;
6715                                 result_page_object = src_object; /* the lock held here is src_object's; make the unlock below release it */
6716                         } else {
6717                                 src_prot = VM_PROT_READ;
6718                                 vm_object_paging_begin(src_object);
6719                                 /* cap cluster size at maximum UPL size */
6720                                 if (os_convert_overflow(amount_left, &cluster_size)) {
6721                                         cluster_size = 0 - (upl_size_t)PAGE_SIZE;
6722                                 }
6723                                 fault_info_src.cluster_size = cluster_size;
6724
6725                                 result_page = VM_PAGE_NULL;
6726                                 result = vm_fault_page(
6727                                         src_object,
6728                                         vm_object_trunc_page(src_offset),
6729                                         VM_PROT_READ, FALSE,
6730                                         FALSE, /* page not looked up */
6731                                         &src_prot,
6732                                         &result_page, &src_top_page,
6733                                         (int *)0, &error, FALSE,
6734                                         FALSE, &fault_info_src);
6735
6736                                 switch (result) {
6737                                 case VM_FAULT_SUCCESS:
6738                                         break;
6739                                 case VM_FAULT_RETRY:
6740                                         goto RetrySourceFault;
6741                                 case VM_FAULT_MEMORY_SHORTAGE:
6742                                         if (vm_page_wait(interruptible)) {
6743                                                 goto RetrySourceFault;
6744                                         }
6745                                         OS_FALLTHROUGH;
6746                                 case VM_FAULT_INTERRUPTED:
6747                                         vm_fault_copy_dst_cleanup(dst_page);
6748                                         RETURN(MACH_SEND_INTERRUPTED);
6749                                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
6750                                         /* success but no VM page: fail */
6751                                         vm_object_paging_end(src_object);
6752                                         vm_object_unlock(src_object);
6753                                         OS_FALLTHROUGH;
6754                                 case VM_FAULT_MEMORY_ERROR:
6755                                         vm_fault_copy_dst_cleanup(dst_page);
6756                                         if (error) {
6757                                                 return error;                   /* direct return: *copy_size is not updated */
6758                                         } else {
6759                                                 return KERN_MEMORY_ERROR;       /* direct return: *copy_size is not updated */
6760                                         }
6761                                 default:
6762                                         panic("vm_fault_copy(2): unexpected "
6763                                             "error 0x%x from "
6764                                             "vm_fault_page()\n", result);
6765                                 }
6766
6767                                 result_page_object = VM_PAGE_OBJECT(result_page);
6768                                 assert((src_top_page == VM_PAGE_NULL) ==
6769                                     (result_page_object == src_object));
6770                         }
6771                         assert((src_prot & VM_PROT_READ) != VM_PROT_NONE);
6772                         vm_object_unlock(result_page_object);   /* src_object itself when src_page == dst_page */
6773                 }
6774
6775                 vm_map_lock_read(dst_map);
6776
6777                 if (!vm_map_verify(dst_map, dst_version)) {
6778                         vm_map_unlock_read(dst_map);
6779                         if (result_page != VM_PAGE_NULL && src_page != dst_page) {
6780                                 vm_fault_copy_cleanup(result_page, src_top_page);
6781                         }
6782                         vm_fault_copy_dst_cleanup(dst_page);
6783                         break;          /* leaves the loop; RETURN(KERN_SUCCESS) below reports the partial amount */
6784                 }
6785                 assert(dst_object == VM_PAGE_OBJECT(dst_page));
6786
6787                 vm_object_lock(dst_object);
6788
6789                 if (dst_object->copy != old_copy_object) {
6790                         vm_object_unlock(dst_object);
6791                         vm_map_unlock_read(dst_map);
6792                         if (result_page != VM_PAGE_NULL && src_page != dst_page) {
6793                                 vm_fault_copy_cleanup(result_page, src_top_page);
6794                         }
6795                         vm_fault_copy_dst_cleanup(dst_page);
6796                         break;          /* same early exit as the map-verification failure above */
6797                 }
6798                 vm_object_unlock(dst_object);
6799
6800                 /*
6801                  *      Copy the page, and note that it is dirty
6802                  *      immediately.
6803                  */
6804
6805                 if (!page_aligned(src_offset) ||
6806                     !page_aligned(dst_offset) ||
6807                     !page_aligned(amount_left)) {
6808                         vm_object_offset_t      src_po,
6809                             dst_po;
6810
6811                         src_po = src_offset - vm_object_trunc_page(src_offset);
6812                         dst_po = dst_offset - vm_object_trunc_page(dst_offset);
6813                         /* Copy only up to the nearer page boundary (source or destination), capped by what is left. */
6814                         if (dst_po > src_po) {
6815                                 part_size = PAGE_SIZE - dst_po;
6816                         } else {
6817                                 part_size = PAGE_SIZE - src_po;
6818                         }
6819                         if (part_size > (amount_left)) {
6820                                 part_size = amount_left;
6821                         }
6822
6823                         if (result_page == VM_PAGE_NULL) {
6824                                 assert((vm_offset_t) dst_po == dst_po);
6825                                 assert((vm_size_t) part_size == part_size);
6826                                 vm_page_part_zero_fill(dst_page,
6827                                     (vm_offset_t) dst_po,
6828                                     (vm_size_t) part_size);
6829                         } else {
6830                                 assert((vm_offset_t) src_po == src_po);
6831                                 assert((vm_offset_t) dst_po == dst_po);
6832                                 assert((vm_size_t) part_size == part_size);
6833                                 vm_page_part_copy(result_page,
6834                                     (vm_offset_t) src_po,
6835                                     dst_page,
6836                                     (vm_offset_t) dst_po,
6837                                     (vm_size_t)part_size);
6838                                 if (!dst_page->vmp_dirty) {
6839                                         vm_object_lock(dst_object);
6840                                         SET_PAGE_DIRTY(dst_page, TRUE);
6841                                         vm_object_unlock(dst_object);
6842                                 }
6843                         }
6844                 } else {
6845                         part_size = PAGE_SIZE;
6846
6847                         if (result_page == VM_PAGE_NULL) {
6848                                 vm_page_zero_fill(dst_page);
6849                         } else {
6850                                 vm_object_lock(result_page_object);
6851                                 vm_page_copy(result_page, dst_page);
6852                                 vm_object_unlock(result_page_object);
6853
6854                                 if (!dst_page->vmp_dirty) {
6855                                         vm_object_lock(dst_object);
6856                                         SET_PAGE_DIRTY(dst_page, TRUE);
6857                                         vm_object_unlock(dst_object);
6858                                 }
6859                         }
6860                 }
6861
6862                 /*
6863                  *      Unlock everything, and move on to the next piece
6864                  */
6865
6866                 vm_map_unlock_read(dst_map);
6867
6868                 if (result_page != VM_PAGE_NULL && src_page != dst_page) {
6869                         vm_fault_copy_cleanup(result_page, src_top_page);
6870                 }
6871                 vm_fault_copy_dst_cleanup(dst_page);
6872
6873                 amount_left -= part_size;
6874                 src_offset += part_size;
6875                 dst_offset += part_size;
6876         } while (amount_left > 0);
6877
6878         RETURN(KERN_SUCCESS);
6879 #undef  RETURN
6880
6881         /*NOTREACHED*/
6882 }
6883
6884 #if     VM_FAULT_CLASSIFY
6885 /*
6886  *      Temporary statistics gathering support (compiled only when VM_FAULT_CLASSIFY is set).
6887  */
6888
6889 /*
6890  *      Statistics arrays: vm_fault_stats[type][level], filled in by vm_fault_classify().
6891  */
6892 #define VM_FAULT_TYPES_MAX      5       /* first dimension: one row per VM_FAULT_TYPE_* value below */
6893 #define VM_FAULT_LEVEL_MAX      8       /* second dimension: shadow links followed before classifying */
6894
6895 int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
6896
6897 #define VM_FAULT_TYPE_ZERO_FILL 0       /* no page, no pager created, no shadow object */
6898 #define VM_FAULT_TYPE_MAP_IN    1       /* page found; not a write fault, or top level with no copy object */
6899 #define VM_FAULT_TYPE_PAGER     2       /* no page, but the object has a pager created */
6900 #define VM_FAULT_TYPE_COPY      3       /* page found; write fault not covered by MAP_IN */
6901 #define VM_FAULT_TYPE_OTHER     4       /* page found but busy, in error, restart or absent */
6902 /* vm_fault_classify: walk the shadow chain from (object, offset) and bump the vm_fault_stats counter
6903  * for the kind of fault found (VM_FAULT_TYPE_*) at the depth where the walk stopped.  No value is returned. */
6904 void
6905 vm_fault_classify(vm_object_t           object,
6906     vm_object_offset_t    offset,
6907     vm_prot_t             fault_type)
6908 {
6909         int             type, level = 0;        /* type is assigned on every path that leaves the loop */
6910         vm_page_t       m;
6911
6912         while (TRUE) {
6913                 m = vm_page_lookup(object, offset);
6914                 if (m != VM_PAGE_NULL) {
6915                         if (m->vmp_busy || m->vmp_error || m->vmp_restart || m->vmp_absent) {
6916                                 type = VM_FAULT_TYPE_OTHER;
6917                                 break;
6918                         }
6919                         if (((fault_type & VM_PROT_WRITE) == 0) ||
6920                             ((level == 0) && object->copy == VM_OBJECT_NULL)) {
6921                                 type = VM_FAULT_TYPE_MAP_IN;
6922                                 break;
6923                         }
6924                         type = VM_FAULT_TYPE_COPY;
6925                         break;
6926                 } else {
6927                         if (object->pager_created) {
6928                                 type = VM_FAULT_TYPE_PAGER;
6929                                 break;
6930                         }
6931                         if (object->shadow == VM_OBJECT_NULL) {
6932                                 type = VM_FAULT_TYPE_ZERO_FILL;
6933                                 break;
6934                         }
6935                         /* Not here and nothing to page in: translate the offset and descend to the shadow object. */
6936                         offset += object->vo_shadow_offset;
6937                         object = object->shadow;
6938                         level++;
6939                         continue;
6940                 }
6941         }
6942         /* Valid columns are 0 .. VM_FAULT_LEVEL_MAX - 1; deeper chains are all counted in the last one. */
6943         if (level >= VM_FAULT_LEVEL_MAX) {
6944                 level = VM_FAULT_LEVEL_MAX - 1;
6945         }
6946
6947         vm_fault_stats[type][level] += 1;
6948
6949         return;
6950 }
6951
6952 /* Reset every vm_fault_stats counter to zero; a cleanup routine meant to be called from the debugger. */
6953
6954 void
6955 vm_fault_classify_init(void)
6956 {
6957         int     row;
6958
6959         for (row = 0; row < VM_FAULT_TYPES_MAX; row++) {
6960                 int     col = 0;
6961
6962                 while (col < VM_FAULT_LEVEL_MAX) {
6963                         vm_fault_stats[row][col++] = 0;
6964                 }
6965         }
6966 }
6967 #endif  /* VM_FAULT_CLASSIFY */
6968
6969 vm_offset_t
6970 kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr)
6971 {
6972         vm_map_entry_t  entry;
6973         vm_object_t     object;
6974         vm_offset_t     object_offset;
6975         vm_page_t       m;
6976         int             compressor_external_state, compressed_count_delta;
6977         int             compressor_flags = (C_DONT_BLOCK | C_KEEP | C_KDP);
6978         int             my_fault_type = VM_PROT_READ;
6979         kern_return_t   kr;
6980         int effective_page_mask, effective_page_size;
6981
6982         if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
6983                 effective_page_mask = VM_MAP_PAGE_MASK(map);
6984                 effective_page_size = VM_MAP_PAGE_SIZE(map);
6985         } else {
6986                 effective_page_mask = PAGE_MASK;
6987                 effective_page_size = PAGE_SIZE;
6988         }
6989
6990         if (not_in_kdp) {
6991                 panic("kdp_lightweight_fault called from outside of debugger context");
6992         }
6993
6994         assert(map != VM_MAP_NULL);
6995
6996         assert((cur_target_addr & effective_page_mask) == 0);
6997         if ((cur_target_addr & effective_page_mask) != 0) {
6998                 return 0;
6999         }
7000
7001         if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) {
7002                 return 0;
7003         }
7004
7005         if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) {
7006                 return 0;
7007         }
7008
7009         if (entry->is_sub_map) {
7010                 return 0;
7011         }
7012
7013         object = VME_OBJECT(entry);
7014         if (object == VM_OBJECT_NULL) {
7015                 return 0;
7016         }
7017
7018         object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry);
7019
7020         while (TRUE) {
7021                 if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) {
7022                         return 0;
7023                 }
7024
7025                 if (object->pager_created && (object->paging_in_progress ||
7026                     object->activity_in_progress)) {
7027                         return 0;
7028                 }
7029
7030                 m = kdp_vm_page_lookup(object, vm_object_trunc_page(object_offset));
7031
7032                 if (m != VM_PAGE_NULL) {
7033                         if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) {
7034                                 return 0;
7035                         }
7036
7037                         if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done || m->vmp_absent || m->vmp_error || m->vmp_cleaning ||
7038                             m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) {
7039                                 return 0;
7040                         }
7041
7042                         assert(!m->vmp_private);
7043                         if (m->vmp_private) {
7044                                 return 0;
7045                         }
7046
7047                         assert(!m->vmp_fictitious);
7048                         if (m->vmp_fictitious) {
7049                                 return 0;
7050                         }
7051
7052                         assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7053                         if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7054                                 return 0;
7055                         }
7056
7057                         return ptoa(VM_PAGE_GET_PHYS_PAGE(m));
7058                 }
7059
7060                 compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
7061
7062                 if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) {
7063                         if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) {
7064                                 kr = vm_compressor_pager_get(object->pager,
7065                                     vm_object_trunc_page(object_offset + object->paging_offset),
7066                                     kdp_compressor_decompressed_page_ppnum, &my_fault_type,
7067                                     compressor_flags, &compressed_count_delta);
7068                                 if (kr == KERN_SUCCESS) {
7069                                         return kdp_compressor_decompressed_page_paddr;
7070                                 } else {
7071                                         return 0;
7072                                 }
7073                         }
7074                 }
7075
7076                 if (object->shadow == VM_OBJECT_NULL) {
7077                         return 0;
7078                 }
7079
7080                 object_offset += object->vo_shadow_offset;
7081                 object = object->shadow;
7082         }
7083 }
7084
/*
 * vm_page_validate_cs_fast():
 * Performs a few quick checks to determine if the page's code signature
 * really needs to be fully validated.  It could:
 *      1. have been modified (i.e. automatically tainted),
 *      2. have already been validated,
 *      3. have already been found to be tainted,
 *      4. no longer have a backing store.
 *
 * The caller must hold the page's VM object lock.  Every path that
 * updates the page, and every path that gets past the "already
 * validated or tainted" check, additionally asserts that the lock is
 * held exclusive.
 *
 * Side effect: a page that has "vmp_wpmapped" set and is not yet
 * tainted for the faulting range gets both its "validated" and
 * "tainted" bits set for that range.
 *
 * Returns TRUE if no full validation is required (or possible),
 * FALSE if the page needs to be fully validated.
 */
static boolean_t
vm_page_validate_cs_fast(
        vm_page_t       page,
        vm_map_size_t   fault_page_size,
        vm_map_offset_t fault_phys_offset)
{
        vm_object_t     object;

        object = VM_PAGE_OBJECT(page);
        vm_object_lock_assert_held(object);

        if (page->vmp_wpmapped &&
            !VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
                /*
                 * This page was mapped for "write" access sometime in the
                 * past and could still be modifiable in the future.
                 * Consider it tainted.
                 * [ If the page was already found to be "tainted", no
                 * need to re-validate. ]
                 */
                vm_object_lock_assert_exclusive(object);
                VMP_CS_SET_VALIDATED(page, fault_page_size, fault_phys_offset, TRUE);
                VMP_CS_SET_TAINTED(page, fault_page_size, fault_phys_offset, TRUE);
                if (cs_debug) {
                        printf("CODESIGNING: %s: "
                            "page %p obj %p off 0x%llx "
                            "was modified\n",
                            __FUNCTION__,
                            page, object, page->vmp_offset);
                }
                vm_cs_validated_dirtied++;
        }

        /* a verdict already exists for this range: nothing more to do */
        if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) ||
            VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
                return TRUE;
        }
        vm_object_lock_assert_exclusive(object);

#if CHECK_CS_VALIDATION_BITMAP
        /*
         * Ask the pager's validation bitmap about this page; on
         * KERN_SUCCESS the whole page is marked validated and untainted.
         * NOTE(review): "object->pager" is used here before the NULL
         * check further down -- confirm the callee tolerates a NULL pager.
         */
        kern_return_t kr;

        kr = vnode_pager_cs_check_validation_bitmap(
                object->pager,
                page->vmp_offset + object->paging_offset,
                CS_BITMAP_CHECK);
        if (kr == KERN_SUCCESS) {
                page->vmp_cs_validated = VMP_CS_ALL_TRUE;
                page->vmp_cs_tainted = VMP_CS_ALL_FALSE;
                vm_cs_bitmap_validated++;
                return TRUE;
        }
#endif /* CHECK_CS_VALIDATION_BITMAP */

        if (!object->alive || object->terminating || object->pager == NULL) {
                /*
                 * The object is terminating and we don't have its pager
                 * so we can't validate the data...
                 */
                return TRUE;
        }

        /* we need to really validate this page */
        vm_object_lock_assert_exclusive(object);
        return FALSE;
}
7161
7162 void
7163 vm_page_validate_cs_mapped_slow(
7164         vm_page_t       page,
7165         const void      *kaddr)
7166 {
7167         vm_object_t             object;
7168         memory_object_offset_t  mo_offset;
7169         memory_object_t         pager;
7170         struct vnode            *vnode;
7171         int                     validated, tainted, nx;
7172
7173         assert(page->vmp_busy);
7174         object = VM_PAGE_OBJECT(page);
7175         vm_object_lock_assert_exclusive(object);
7176
7177         vm_cs_validates++;
7178
7179         /*
7180          * Since we get here to validate a page that was brought in by
7181          * the pager, we know that this pager is all setup and ready
7182          * by now.
7183          */
7184         assert(object->code_signed);
7185         assert(!object->internal);
7186         assert(object->pager != NULL);
7187         assert(object->pager_ready);
7188
7189         pager = object->pager;
7190         assert(object->paging_in_progress);
7191         vnode = vnode_pager_lookup_vnode(pager);
7192         mo_offset = page->vmp_offset + object->paging_offset;
7193
7194         /* verify the SHA1 hash for this page */
7195         validated = 0;
7196         tainted = 0;
7197         nx = 0;
7198         cs_validate_page(vnode,
7199             pager,
7200             mo_offset,
7201             (const void *)((const char *)kaddr),
7202             &validated,
7203             &tainted,
7204             &nx);
7205
7206         page->vmp_cs_validated |= validated;
7207         page->vmp_cs_tainted |= tainted;
7208         page->vmp_cs_nx |= nx;
7209
7210 #if CHECK_CS_VALIDATION_BITMAP
7211         if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
7212             page->vmp_cs_tainted == VMP_CS_ALL_FALSE) {
7213                 vnode_pager_cs_check_validation_bitmap(object->pager,
7214                     mo_offset,
7215                     CS_BITMAP_SET);
7216         }
7217 #endif /* CHECK_CS_VALIDATION_BITMAP */
7218 }
7219
7220 void
7221 vm_page_validate_cs_mapped(
7222         vm_page_t       page,
7223         vm_map_size_t   fault_page_size,
7224         vm_map_offset_t fault_phys_offset,
7225         const void      *kaddr)
7226 {
7227         if (!vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
7228                 vm_page_validate_cs_mapped_slow(page, kaddr);
7229         }
7230 }
7231
7232 void
7233 vm_page_validate_cs(
7234         vm_page_t       page,
7235         vm_map_size_t   fault_page_size,
7236         vm_map_offset_t fault_phys_offset)
7237 {
7238         vm_object_t             object;
7239         vm_object_offset_t      offset;
7240         vm_map_offset_t         koffset;
7241         vm_map_size_t           ksize;
7242         vm_offset_t             kaddr;
7243         kern_return_t           kr;
7244         boolean_t               busy_page;
7245         boolean_t               need_unmap;
7246
7247         object = VM_PAGE_OBJECT(page);
7248         vm_object_lock_assert_held(object);
7249
7250         if (vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
7251                 return;
7252         }
7253         vm_object_lock_assert_exclusive(object);
7254
7255         assert(object->code_signed);
7256         offset = page->vmp_offset;
7257
7258         busy_page = page->vmp_busy;
7259         if (!busy_page) {
7260                 /* keep page busy while we map (and unlock) the VM object */
7261                 page->vmp_busy = TRUE;
7262         }
7263
7264         /*
7265          * Take a paging reference on the VM object
7266          * to protect it from collapse or bypass,
7267          * and keep it from disappearing too.
7268          */
7269         vm_object_paging_begin(object);
7270
7271         /* map the page in the kernel address space */
7272         ksize = PAGE_SIZE_64;
7273         koffset = 0;
7274         need_unmap = FALSE;
7275         kr = vm_paging_map_object(page,
7276             object,
7277             offset,
7278             VM_PROT_READ,
7279             FALSE,                       /* can't unlock object ! */
7280             &ksize,
7281             &koffset,
7282             &need_unmap);
7283         if (kr != KERN_SUCCESS) {
7284                 panic("%s: could not map page: 0x%x\n", __FUNCTION__, kr);
7285         }
7286         kaddr = CAST_DOWN(vm_offset_t, koffset);
7287
7288         /* validate the mapped page */
7289         vm_page_validate_cs_mapped_slow(page, (const void *) kaddr);
7290
7291         assert(page->vmp_busy);
7292         assert(object == VM_PAGE_OBJECT(page));
7293         vm_object_lock_assert_exclusive(object);
7294
7295         if (!busy_page) {
7296                 PAGE_WAKEUP_DONE(page);
7297         }
7298         if (need_unmap) {
7299                 /* unmap the map from the kernel address space */
7300                 vm_paging_unmap_object(object, koffset, koffset + ksize);
7301                 koffset = 0;
7302                 ksize = 0;
7303                 kaddr = 0;
7304         }
7305         vm_object_paging_end(object);
7306 }
7307
7308 void
7309 vm_page_validate_cs_mapped_chunk(
7310         vm_page_t       page,
7311         const void      *kaddr,
7312         vm_offset_t     chunk_offset,
7313         vm_size_t       chunk_size,
7314         boolean_t       *validated_p,
7315         unsigned        *tainted_p)
7316 {
7317         vm_object_t             object;
7318         vm_object_offset_t      offset, offset_in_page;
7319         memory_object_t         pager;
7320         struct vnode            *vnode;
7321         boolean_t               validated;
7322         unsigned                tainted;
7323
7324         *validated_p = FALSE;
7325         *tainted_p = 0;
7326
7327         assert(page->vmp_busy);
7328         object = VM_PAGE_OBJECT(page);
7329         vm_object_lock_assert_exclusive(object);
7330
7331         assert(object->code_signed);
7332         offset = page->vmp_offset;
7333
7334         if (!object->alive || object->terminating || object->pager == NULL) {
7335                 /*
7336                  * The object is terminating and we don't have its pager
7337                  * so we can't validate the data...
7338                  */
7339                 return;
7340         }
7341         /*
7342          * Since we get here to validate a page that was brought in by
7343          * the pager, we know that this pager is all setup and ready
7344          * by now.
7345          */
7346         assert(!object->internal);
7347         assert(object->pager != NULL);
7348         assert(object->pager_ready);
7349
7350         pager = object->pager;
7351         assert(object->paging_in_progress);
7352         vnode = vnode_pager_lookup_vnode(pager);
7353
7354         /* verify the signature for this chunk */
7355         offset_in_page = chunk_offset;
7356         assert(offset_in_page < PAGE_SIZE);
7357
7358         tainted = 0;
7359         validated = cs_validate_range(vnode,
7360             pager,
7361             (object->paging_offset +
7362             offset +
7363             offset_in_page),
7364             (const void *)((const char *)kaddr
7365             + offset_in_page),
7366             chunk_size,
7367             &tainted);
7368         if (validated) {
7369                 *validated_p = TRUE;
7370         }
7371         if (tainted) {
7372                 *tainted_p = tainted;
7373         }
7374 }
7375
/*
 * vm_rtfrecord_lock():
 * Takes the vm_rtfr_slock spin lock.  The fault-record routines below
 * hold it around their accesses to "vmrtfrs".
 */
static void
vm_rtfrecord_lock(void)
{
        lck_spin_lock(&vm_rtfr_slock);
}
7381
/*
 * vm_rtfrecord_unlock():
 * Releases the vm_rtfr_slock spin lock taken by vm_rtfrecord_lock().
 */
static void
vm_rtfrecord_unlock(void)
{
        lck_spin_unlock(&vm_rtfr_slock);
}
7387
7388 unsigned int
7389 vmrtfaultinfo_bufsz(void)
7390 {
7391         return vmrtf_num_records * sizeof(vm_rtfault_record_t);
7392 }
7393
7394 #include <kern/backtrace.h>
7395
7396 __attribute__((noinline))
7397 static void
7398 vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault)
7399 {
7400         uint64_t fend = mach_continuous_time();
7401
7402         uint64_t cfpc = 0;
7403         uint64_t ctid = cthread->thread_id;
7404         uint64_t cupid = get_current_unique_pid();
7405
7406         uintptr_t bpc = 0;
7407         int btr = 0;
7408         bool u64 = false;
7409
7410         /* Capture a single-frame backtrace; this extracts just the program
7411          * counter at the point of the fault into "bpc", and should perform no
7412          * further user stack traversals, thus avoiding copyin()s and further
7413          * faults.
7414          */
7415         unsigned int bfrs = backtrace_thread_user(cthread, &bpc, 1U, &btr, &u64, NULL, false);
7416
7417         if ((btr == 0) && (bfrs > 0)) {
7418                 cfpc = bpc;
7419         }
7420
7421         assert((fstart != 0) && fend >= fstart);
7422         vm_rtfrecord_lock();
7423         assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi);
7424
7425         vmrtfrs.vmrtf_total++;
7426         vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++];
7427
7428         cvmr->rtfabstime = fstart;
7429         cvmr->rtfduration = fend - fstart;
7430         cvmr->rtfaddr = fault_vaddr;
7431         cvmr->rtfpc = cfpc;
7432         cvmr->rtftype = type_of_fault;
7433         cvmr->rtfupid = cupid;
7434         cvmr->rtftid = ctid;
7435
7436         if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) {
7437                 vmrtfrs.vmrtfr_curi = 0;
7438         }
7439
7440         vm_rtfrecord_unlock();
7441 }
7442
7443 int
7444 vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, unsigned long vrecordsz, void *vrecords, unsigned long *vmrtfrv)
7445 {
7446         vm_rtfault_record_t *cvmrd = vrecords;
7447         size_t residue = vrecordsz;
7448         size_t numextracted = 0;
7449         boolean_t early_exit = FALSE;
7450
7451         vm_rtfrecord_lock();
7452
7453         for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) {
7454                 if (residue < sizeof(vm_rtfault_record_t)) {
7455                         early_exit = TRUE;
7456                         break;
7457                 }
7458
7459                 if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) {
7460 #if     DEVELOPMENT || DEBUG
7461                         if (isroot == FALSE) {
7462                                 continue;
7463                         }
7464 #else
7465                         continue;
7466 #endif /* DEVDEBUG */
7467                 }
7468
7469                 *cvmrd = vmrtfrs.vm_rtf_records[vmfi];
7470                 cvmrd++;
7471                 residue -= sizeof(vm_rtfault_record_t);
7472                 numextracted++;
7473         }
7474
7475         vm_rtfrecord_unlock();
7476
7477         *vmrtfrv = numextracted;
7478         return early_exit;
7479 }