osfmk/vm/vm_fault.c

   1 /*
   2  * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   vm_fault.c
  60  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  61  *
  62  *      Page fault handling module.
  63  */
  64
  65 #include <mach_cluster_stats.h>
  66 #include <mach_pagemap.h>
  67 #include <libkern/OSAtomic.h>
  68
  69 #include <mach/mach_types.h>
  70 #include <mach/kern_return.h>
  71 #include <mach/message.h>       /* for error codes */
  72 #include <mach/vm_param.h>
  73 #include <mach/vm_behavior.h>
  74 #include <mach/memory_object.h>
  75 /* For memory_object_data_{request,unlock} */
  76 #include <mach/sdt.h>
  77
  78 #include <kern/kern_types.h>
  79 #include <kern/host_statistics.h>
  80 #include <kern/counters.h>
  81 #include <kern/task.h>
  82 #include <kern/thread.h>
  83 #include <kern/sched_prim.h>
  84 #include <kern/host.h>
  85 #include <kern/mach_param.h>
  86 #include <kern/macro_help.h>
  87 #include <kern/zalloc.h>
  88 #include <kern/misc_protos.h>
  89 #include <kern/policy_internal.h>
  90
  91 #include <vm/vm_compressor.h>
  92 #include <vm/vm_compressor_pager.h>
  93 #include <vm/vm_fault.h>
  94 #include <vm/vm_map.h>
  95 #include <vm/vm_object.h>
  96 #include <vm/vm_page.h>
  97 #include <vm/vm_kern.h>
  98 #include <vm/pmap.h>
  99 #include <vm/vm_pageout.h>
 100 #include <vm/vm_protos.h>
 101 #include <vm/vm_external.h>
 102 #include <vm/memory_object.h>
 103 #include <vm/vm_purgeable_internal.h>   /* Needed by some vm_page.h macros */
 104 #include <vm/vm_shared_region.h>
 105
 106 #include <sys/codesign.h>
 107 #include <sys/reason.h>
 108 #include <sys/signalvar.h>
 109
 110 #include <san/kasan.h>
 111
 112 #define VM_FAULT_CLASSIFY       0
 113
 114 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
 115
 116 int vm_protect_privileged_from_untrusted = 1;
 117
 118 unsigned int    vm_object_pagein_throttle = 16;
 119
 120 /*
 121  * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
 122  * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
 123  * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
 124  * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
 125  * keep the UI active so that the user has a chance to kill the offending task before the system
 126  * completely hangs.
 127  *
 128  * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
 129  * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
 130  * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
 131  * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
 132  */
 133
 134 extern void throttle_lowpri_io(int);
 135
 136 extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);
 137
 138 uint64_t vm_hard_throttle_threshold;
 139
 140
 141
/*
 * Predicate used to decide whether the current thread gets the hard
 * throttle delay.  It is true when either:
 *   - vm_wants_task_throttled() reports true for the current task, or
 *   - free pages are below vm_page_throttle_limit, or
 *     HARD_THROTTLE_LIMIT_REACHED() holds, AND the current thread's
 *     effective I/O policy is at least THROTTLE_LEVEL_THROTTLED.
 */
#define NEED_TO_HARD_THROTTLE_THIS_TASK()       (vm_wants_task_throttled(current_task()) ||     \
                                                 ((vm_page_free_count < vm_page_throttle_limit || \
                                                   HARD_THROTTLE_LIMIT_REACHED()) && \
                                                  proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED))
 146
 147
 148 #define HARD_THROTTLE_DELAY     10000   /* 10000 us == 10 ms */
 149 #define SOFT_THROTTLE_DELAY     200     /* 200 us == .2 ms */
 150
 151 #define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS   6
 152 #define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC  20000
 153
 154
 155 #define VM_STAT_DECOMPRESSIONS()        \
 156 MACRO_BEGIN                             \
 157         VM_STAT_INCR(decompressions);       \
 158         current_thread()->decompressions++; \
 159 MACRO_END
 160
 161 boolean_t current_thread_aborted(void);
 162
 163 /* Forward declarations of internal routines. */
 164 static kern_return_t vm_fault_wire_fast(
 165         vm_map_t        map,
 166         vm_map_offset_t va,
 167         vm_prot_t       prot,
 168         vm_tag_t        wire_tag,
 169         vm_map_entry_t  entry,
 170         pmap_t          pmap,
 171         vm_map_offset_t pmap_addr,
 172         ppnum_t         *physpage_p);
 173
 174 static kern_return_t vm_fault_internal(
 175         vm_map_t        map,
 176         vm_map_offset_t vaddr,
 177         vm_prot_t       caller_prot,
 178         boolean_t       change_wiring,
 179         vm_tag_t        wire_tag,
 180         int             interruptible,
 181         pmap_t          pmap,
 182         vm_map_offset_t pmap_addr,
 183         ppnum_t         *physpage_p);
 184
 185 static void vm_fault_copy_cleanup(
 186         vm_page_t       page,
 187         vm_page_t       top_page);
 188
 189 static void vm_fault_copy_dst_cleanup(
 190         vm_page_t       page);
 191
 192 #if     VM_FAULT_CLASSIFY
 193 extern void vm_fault_classify(vm_object_t       object,
 194     vm_object_offset_t    offset,
 195     vm_prot_t             fault_type);
 196
 197 extern void vm_fault_classify_init(void);
 198 #endif
 199
 200 unsigned long vm_pmap_enter_blocked = 0;
 201 unsigned long vm_pmap_enter_retried = 0;
 202
 203 unsigned long vm_cs_validates = 0;
 204 unsigned long vm_cs_revalidates = 0;
 205 unsigned long vm_cs_query_modified = 0;
 206 unsigned long vm_cs_validated_dirtied = 0;
 207 unsigned long vm_cs_bitmap_validated = 0;
 208 #if PMAP_CS
 209 uint64_t vm_cs_defer_to_pmap_cs = 0;
 210 uint64_t vm_cs_defer_to_pmap_cs_not = 0;
 211 #endif /* PMAP_CS */
 212
 213 void vm_pre_fault(vm_map_offset_t, vm_prot_t);
 214
 215 extern char *kdp_compressor_decompressed_page;
 216 extern addr64_t kdp_compressor_decompressed_page_paddr;
 217 extern ppnum_t  kdp_compressor_decompressed_page_ppnum;
 218
/*
 * Bookkeeping for the buffer of vm_rtfault_record_t entries that
 * vm_rtfault_record_init() allocates.
 */
struct vmrtfr {
        int vmrtfr_maxi;        /* highest valid index; set to vmrtf_num_records - 1 at init */
        int vmrtfr_curi;        /* presumably the current slot index -- users not visible here, TODO confirm */
        int64_t vmrtf_total;    /* presumably a running count of recorded faults -- TODO confirm */
        vm_rtfault_record_t *vm_rtf_records;    /* record array, allocated and zeroed at init */
} vmrtfrs;
/* Default buffer size in bytes, and the record count that fits in it. */
#define VMRTF_DEFAULT_BUFSIZE (4096)
#define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t))
/* Number of records to allocate; overridable with the "vm_rtfault_records" boot-arg. */
int vmrtf_num_records = VMRTF_NUM_RECORDS_DEFAULT;
 228
 229 static void vm_rtfrecord_lock(void);
 230 static void vm_rtfrecord_unlock(void);
 231 static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int);
 232
 233 lck_spin_t vm_rtfr_slock;
 234 extern lck_grp_t vm_page_lck_grp_bucket;
 235 extern lck_attr_t vm_page_lck_attr;
 236
 237 /*
 238  *      Routine:        vm_fault_init
 239  *      Purpose:
 240  *              Initialize our private data structures.
 241  */
 242 void
 243 vm_fault_init(void)
 244 {
 245         int i, vm_compressor_temp;
 246         boolean_t need_default_val = TRUE;
 247         /*
 248          * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
 249          * computed as a percentage of available memory, and the percentage used is scaled inversely with
 250          * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
 251          * and reduce the value down to 10% for very large memory configurations.  This helps give us a
 252          * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
 253          * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
 254          */
 255
 256         vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024 * 1024 * 1024)), 25)) / 100;
 257
 258         /*
 259          * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
 260          */
 261
 262         if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof(vm_compressor_temp))) {
 263                 for (i = 0; i < VM_PAGER_MAX_MODES; i++) {
 264                         if (vm_compressor_temp > 0 &&
 265                             ((vm_compressor_temp & (1 << i)) == vm_compressor_temp)) {
 266                                 need_default_val = FALSE;
 267                                 vm_compressor_mode = vm_compressor_temp;
 268                                 break;
 269                         }
 270                 }
 271                 if (need_default_val) {
 272                         printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
 273                 }
 274         }
 275         if (need_default_val) {
 276                 /* If no boot arg or incorrect boot arg, try device tree. */
 277                 PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
 278         }
 279         printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
 280
 281         PE_parse_boot_argn("vm_protect_privileged_from_untrusted", &vm_protect_privileged_from_untrusted, sizeof(vm_protect_privileged_from_untrusted));
 282 }
 283
 284 void
 285 vm_rtfault_record_init(void)
 286 {
 287         PE_parse_boot_argn("vm_rtfault_records", &vmrtf_num_records, sizeof(vmrtf_num_records));
 288
 289         assert(vmrtf_num_records >= 1);
 290         vmrtf_num_records = MAX(vmrtf_num_records, 1);
 291         size_t kallocsz = vmrtf_num_records * sizeof(vm_rtfault_record_t);
 292         vmrtfrs.vm_rtf_records = kalloc(kallocsz);
 293         bzero(vmrtfrs.vm_rtf_records, kallocsz);
 294         vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1;
 295         lck_spin_init(&vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
 296 }
 297 /*
 298  *      Routine:        vm_fault_cleanup
 299  *      Purpose:
 300  *              Clean up the result of vm_fault_page.
 301  *      Results:
 302  *              The paging reference for "object" is released.
 303  *              "object" is unlocked.
 304  *              If "top_page" is not null,  "top_page" is
 305  *              freed and the paging reference for the object
 306  *              containing it is released.
 307  *
 308  *      In/out conditions:
 309  *              "object" must be locked.
 310  */
 311 void
 312 vm_fault_cleanup(
 313         vm_object_t     object,
 314         vm_page_t       top_page)
 315 {
 316         vm_object_paging_end(object);
 317         vm_object_unlock(object);
 318
 319         if (top_page != VM_PAGE_NULL) {
 320                 object = VM_PAGE_OBJECT(top_page);
 321
 322                 vm_object_lock(object);
 323                 VM_PAGE_FREE(top_page);
 324                 vm_object_paging_end(object);
 325                 vm_object_unlock(object);
 326         }
 327 }
 328
 329 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
 330
 331
 332 boolean_t       vm_page_deactivate_behind = TRUE;
 333 /*
 334  * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 335  */
 336 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW     128
 337 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER    16              /* don't make this too big... */
 338                                                                 /* we use it to size an array on the stack */
 339
 340 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
 341
 342 #define MAX_SEQUENTIAL_RUN      (1024 * 1024 * 1024)
 343
 344 /*
 345  * vm_page_is_sequential
 346  *
 347  * Determine if sequential access is in progress
 348  * in accordance with the behavior specified.
 349  * Update state to indicate current access pattern.
 350  *
 351  * object must have at least the shared lock held
 352  */
 353 static
 354 void
 355 vm_fault_is_sequential(
 356         vm_object_t             object,
 357         vm_object_offset_t      offset,
 358         vm_behavior_t           behavior)
 359 {
 360         vm_object_offset_t      last_alloc;
 361         int                     sequential;
 362         int                     orig_sequential;
 363
 364         last_alloc = object->last_alloc;
 365         sequential = object->sequential;
 366         orig_sequential = sequential;
 367
 368         switch (behavior) {
 369         case VM_BEHAVIOR_RANDOM:
 370                 /*
 371                  * reset indicator of sequential behavior
 372                  */
 373                 sequential = 0;
 374                 break;
 375
 376         case VM_BEHAVIOR_SEQUENTIAL:
 377                 if (offset && last_alloc == offset - PAGE_SIZE_64) {
 378                         /*
 379                          * advance indicator of sequential behavior
 380                          */
 381                         if (sequential < MAX_SEQUENTIAL_RUN) {
 382                                 sequential += PAGE_SIZE;
 383                         }
 384                 } else {
 385                         /*
 386                          * reset indicator of sequential behavior
 387                          */
 388                         sequential = 0;
 389                 }
 390                 break;
 391
 392         case VM_BEHAVIOR_RSEQNTL:
 393                 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
 394                         /*
 395                          * advance indicator of sequential behavior
 396                          */
 397                         if (sequential > -MAX_SEQUENTIAL_RUN) {
 398                                 sequential -= PAGE_SIZE;
 399                         }
 400                 } else {
 401                         /*
 402                          * reset indicator of sequential behavior
 403                          */
 404                         sequential = 0;
 405                 }
 406                 break;
 407
 408         case VM_BEHAVIOR_DEFAULT:
 409         default:
 410                 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
 411                         /*
 412                          * advance indicator of sequential behavior
 413                          */
 414                         if (sequential < 0) {
 415                                 sequential = 0;
 416                         }
 417                         if (sequential < MAX_SEQUENTIAL_RUN) {
 418                                 sequential += PAGE_SIZE;
 419                         }
 420                 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
 421                         /*
 422                          * advance indicator of sequential behavior
 423                          */
 424                         if (sequential > 0) {
 425                                 sequential = 0;
 426                         }
 427                         if (sequential > -MAX_SEQUENTIAL_RUN) {
 428                                 sequential -= PAGE_SIZE;
 429                         }
 430                 } else {
 431                         /*
 432                          * reset indicator of sequential behavior
 433                          */
 434                         sequential = 0;
 435                 }
 436                 break;
 437         }
 438         if (sequential != orig_sequential) {
 439                 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
 440                         /*
 441                          * if someone else has already updated object->sequential
 442                          * don't bother trying to update it or object->last_alloc
 443                          */
 444                         return;
 445                 }
 446         }
 447         /*
 448          * I'd like to do this with a OSCompareAndSwap64, but that
 449          * doesn't exist for PPC...  however, it shouldn't matter
 450          * that much... last_alloc is maintained so that we can determine
 451          * if a sequential access pattern is taking place... if only
 452          * one thread is banging on this object, no problem with the unprotected
 453          * update... if 2 or more threads are banging away, we run the risk of
 454          * someone seeing a mangled update... however, in the face of multiple
 455          * accesses, no sequential access pattern can develop anyway, so we
 456          * haven't lost any real info.
 457          */
 458         object->last_alloc = offset;
 459 }
 460
 461
 462 int vm_page_deactivate_behind_count = 0;
 463
 464 /*
 465  * vm_page_deactivate_behind
 466  *
 467  * Determine if sequential access is in progress
 468  * in accordance with the behavior specified.  If
 469  * so, compute a potential page to deactivate and
 470  * deactivate it.
 471  *
 472  * object must be locked.
 473  *
 474  * return TRUE if we actually deactivate a page
 475  */
 476 static
 477 boolean_t
 478 vm_fault_deactivate_behind(
 479         vm_object_t             object,
 480         vm_object_offset_t      offset,
 481         vm_behavior_t           behavior)
 482 {
 483         int             n;
 484         int             pages_in_run = 0;
 485         int             max_pages_in_run = 0;
 486         int             sequential_run;
 487         int             sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 488         vm_object_offset_t      run_offset = 0;
 489         vm_object_offset_t      pg_offset = 0;
 490         vm_page_t       m;
 491         vm_page_t       page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
 492
 493         pages_in_run = 0;
 494 #if TRACEFAULTPAGE
 495         dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
 496 #endif
 497
 498         if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
 499                 /*
 500                  * Do not deactivate pages from the kernel object: they
 501                  * are not intended to become pageable.
 502                  * or we've disabled the deactivate behind mechanism
 503                  */
 504                 return FALSE;
 505         }
 506         if ((sequential_run = object->sequential)) {
 507                 if (sequential_run < 0) {
 508                         sequential_behavior = VM_BEHAVIOR_RSEQNTL;
 509                         sequential_run = 0 - sequential_run;
 510                 } else {
 511                         sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 512                 }
 513         }
 514         switch (behavior) {
 515         case VM_BEHAVIOR_RANDOM:
 516                 break;
 517         case VM_BEHAVIOR_SEQUENTIAL:
 518                 if (sequential_run >= (int)PAGE_SIZE) {
 519                         run_offset = 0 - PAGE_SIZE_64;
 520                         max_pages_in_run = 1;
 521                 }
 522                 break;
 523         case VM_BEHAVIOR_RSEQNTL:
 524                 if (sequential_run >= (int)PAGE_SIZE) {
 525                         run_offset = PAGE_SIZE_64;
 526                         max_pages_in_run = 1;
 527                 }
 528                 break;
 529         case VM_BEHAVIOR_DEFAULT:
 530         default:
 531         {       vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
 532
 533                 /*
 534                  * determine if the run of sequential accesss has been
 535                  * long enough on an object with default access behavior
 536                  * to consider it for deactivation
 537                  */
 538                 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
 539                         /*
 540                          * the comparisons between offset and behind are done
 541                          * in this kind of odd fashion in order to prevent wrap around
 542                          * at the end points
 543                          */
 544                         if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
 545                                 if (offset >= behind) {
 546                                         run_offset = 0 - behind;
 547                                         pg_offset = PAGE_SIZE_64;
 548                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 549                                 }
 550                         } else {
 551                                 if (offset < -behind) {
 552                                         run_offset = behind;
 553                                         pg_offset = 0 - PAGE_SIZE_64;
 554                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 555                                 }
 556                         }
 557                 }
 558                 break;}
 559         }
 560         for (n = 0; n < max_pages_in_run; n++) {
 561                 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
 562
 563                 if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->vmp_fictitious && !m->vmp_absent) {
 564                         page_run[pages_in_run++] = m;
 565
 566                         /*
 567                          * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
 568                          *
 569                          * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
 570                          * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
 571                          * new reference happens. If no futher references happen on the page after that remote TLB flushes
 572                          * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
 573                          * by pageout_scan, which is just fine since the last reference would have happened quite far
 574                          * in the past (TLB caches don't hang around for very long), and of course could just as easily
 575                          * have happened before we did the deactivate_behind.
 576                          */
 577                         pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
 578                 }
 579         }
 580         if (pages_in_run) {
 581                 vm_page_lockspin_queues();
 582
 583                 for (n = 0; n < pages_in_run; n++) {
 584                         m = page_run[n];
 585
 586                         vm_page_deactivate_internal(m, FALSE);
 587
 588                         vm_page_deactivate_behind_count++;
 589 #if TRACEFAULTPAGE
 590                         dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
 591 #endif
 592                 }
 593                 vm_page_unlock_queues();
 594
 595                 return TRUE;
 596         }
 597         return FALSE;
 598 }
 599
 600
 601 #if (DEVELOPMENT || DEBUG)
 602 uint32_t        vm_page_creation_throttled_hard = 0;
 603 uint32_t        vm_page_creation_throttled_soft = 0;
 604 uint64_t        vm_page_creation_throttle_avoided = 0;
 605 #endif /* DEVELOPMENT || DEBUG */
 606
 607 static int
 608 vm_page_throttled(boolean_t page_kept)
 609 {
 610         clock_sec_t     elapsed_sec;
 611         clock_sec_t     tv_sec;
 612         clock_usec_t    tv_usec;
 613
 614         thread_t thread = current_thread();
 615
 616         if (thread->options & TH_OPT_VMPRIV) {
 617                 return 0;
 618         }
 619
 620         if (thread->t_page_creation_throttled) {
 621                 thread->t_page_creation_throttled = 0;
 622
 623                 if (page_kept == FALSE) {
 624                         goto no_throttle;
 625                 }
 626         }
 627         if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
 628 #if (DEVELOPMENT || DEBUG)
 629                 thread->t_page_creation_throttled_hard++;
 630                 OSAddAtomic(1, &vm_page_creation_throttled_hard);
 631 #endif /* DEVELOPMENT || DEBUG */
 632                 return HARD_THROTTLE_DELAY;
 633         }
 634
 635         if ((vm_page_free_count < vm_page_throttle_limit || (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
 636             thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
 637                 if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) {
 638 #if (DEVELOPMENT || DEBUG)
 639                         OSAddAtomic64(1, &vm_page_creation_throttle_avoided);
 640 #endif
 641                         goto no_throttle;
 642                 }
 643                 clock_get_system_microtime(&tv_sec, &tv_usec);
 644
 645                 elapsed_sec = tv_sec - thread->t_page_creation_time;
 646
 647                 if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
 648                     (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
 649                         if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
 650                                 /*
 651                                  * we'll reset our stats to give a well behaved app
 652                                  * that was unlucky enough to accumulate a bunch of pages
 653                                  * over a long period of time a chance to get out of
 654                                  * the throttled state... we reset the counter and timestamp
 655                                  * so that if it stays under the rate limit for the next second
 656                                  * it will be back in our good graces... if it exceeds it, it
 657                                  * will remain in the throttled state
 658                                  */
 659                                 thread->t_page_creation_time = tv_sec;
 660                                 thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
 661                         }
 662                         VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1);
 663
 664                         thread->t_page_creation_throttled = 1;
 665
 666                         if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) {
 667 #if (DEVELOPMENT || DEBUG)
 668                                 thread->t_page_creation_throttled_hard++;
 669                                 OSAddAtomic(1, &vm_page_creation_throttled_hard);
 670 #endif /* DEVELOPMENT || DEBUG */
 671                                 return HARD_THROTTLE_DELAY;
 672                         } else {
 673 #if (DEVELOPMENT || DEBUG)
 674                                 thread->t_page_creation_throttled_soft++;
 675                                 OSAddAtomic(1, &vm_page_creation_throttled_soft);
 676 #endif /* DEVELOPMENT || DEBUG */
 677                                 return SOFT_THROTTLE_DELAY;
 678                         }
 679                 }
 680                 thread->t_page_creation_time = tv_sec;
 681                 thread->t_page_creation_count = 0;
 682         }
 683 no_throttle:
 684         thread->t_page_creation_count++;
 685
 686         return 0;
 687 }
 688
 689
 690 /*
 691  * check for various conditions that would
 692  * prevent us from creating a ZF page...
 693  * cleanup is based on being called from vm_fault_page
 694  *
 695  * object must be locked
 696  * object == m->vmp_object
 697  */
 698 static vm_fault_return_t
 699 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle)
 700 {
 701         int throttle_delay;
 702
 703         if (object->shadow_severed ||
 704             VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
 705                 /*
 706                  * Either:
 707                  * 1. the shadow chain was severed,
 708                  * 2. the purgeable object is volatile or empty and is marked
 709                  *    to fault on access while volatile.
 710                  * Just have to return an error at this point
 711                  */
 712                 if (m != VM_PAGE_NULL) {
 713                         VM_PAGE_FREE(m);
 714                 }
 715                 vm_fault_cleanup(object, first_m);
 716
 717                 thread_interrupt_level(interruptible_state);
 718
 719                 return VM_FAULT_MEMORY_ERROR;
 720         }
 721         if (page_throttle == TRUE) {
 722                 if ((throttle_delay = vm_page_throttled(FALSE))) {
 723                         /*
 724                          * we're throttling zero-fills...
 725                          * treat this as if we couldn't grab a page
 726                          */
 727                         if (m != VM_PAGE_NULL) {
 728                                 VM_PAGE_FREE(m);
 729                         }
 730                         vm_fault_cleanup(object, first_m);
 731
 732                         VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
 733
 734                         delay(throttle_delay);
 735
 736                         if (current_thread_aborted()) {
 737                                 thread_interrupt_level(interruptible_state);
 738                                 return VM_FAULT_INTERRUPTED;
 739                         }
 740                         thread_interrupt_level(interruptible_state);
 741
 742                         return VM_FAULT_MEMORY_SHORTAGE;
 743                 }
 744         }
 745         return VM_FAULT_SUCCESS;
 746 }
 747
 748
 749 /*
 750  * do the work to zero fill a page and
 751  * inject it into the correct paging queue
 752  *
 753  * m->vmp_object must be locked
 754  * page queue lock must NOT be held
 755  */
 756 static int
 757 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
 758 {
 759         int my_fault = DBG_ZERO_FILL_FAULT;
 760         vm_object_t     object;
 761
 762         object = VM_PAGE_OBJECT(m);
 763
 764         /*
 765          * This is is a zero-fill page fault...
 766          *
 767          * Checking the page lock is a waste of
 768          * time;  this page was absent, so
 769          * it can't be page locked by a pager.
 770          *
 771          * we also consider it undefined
 772          * with respect to instruction
 773          * execution.  i.e. it is the responsibility
 774          * of higher layers to call for an instruction
 775          * sync after changing the contents and before
 776          * sending a program into this area.  We
 777          * choose this approach for performance
 778          */
 779         m->vmp_pmapped = TRUE;
 780
 781         m->vmp_cs_validated = FALSE;
 782         m->vmp_cs_tainted = FALSE;
 783         m->vmp_cs_nx = FALSE;
 784
 785         if (no_zero_fill == TRUE) {
 786                 my_fault = DBG_NZF_PAGE_FAULT;
 787
 788                 if (m->vmp_absent && m->vmp_busy) {
 789                         return my_fault;
 790                 }
 791         } else {
 792                 vm_page_zero_fill(m);
 793
 794                 VM_STAT_INCR(zero_fill_count);
 795                 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
 796         }
 797         assert(!m->vmp_laundry);
 798         assert(object != kernel_object);
 799         //assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
 800
 801         if (!VM_DYNAMIC_PAGING_ENABLED() &&
 802             (object->purgable == VM_PURGABLE_DENY ||
 803             object->purgable == VM_PURGABLE_NONVOLATILE ||
 804             object->purgable == VM_PURGABLE_VOLATILE)) {
 805                 vm_page_lockspin_queues();
 806
 807                 if (!VM_DYNAMIC_PAGING_ENABLED()) {
 808                         assert(!VM_PAGE_WIRED(m));
 809
 810                         /*
 811                          * can't be on the pageout queue since we don't
 812                          * have a pager to try and clean to
 813                          */
 814                         vm_page_queues_remove(m, TRUE);
 815                         vm_page_check_pageable_safe(m);
 816                         vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
 817                         m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
 818                         vm_page_throttled_count++;
 819                 }
 820                 vm_page_unlock_queues();
 821         }
 822         return my_fault;
 823 }
 824
 825
 826 /*
 827  *      Routine:        vm_fault_page
 828  *      Purpose:
 829  *              Find the resident page for the virtual memory
 830  *              specified by the given virtual memory object
 831  *              and offset.
 832  *      Additional arguments:
 *              The required permissions for the page are given
 834  *              in "fault_type".  Desired permissions are included
 835  *              in "protection".
 836  *              fault_info is passed along to determine pagein cluster
 837  *              limits... it contains the expected reference pattern,
 838  *              cluster size if available, etc...
 839  *
 840  *              If the desired page is known to be resident (for
 841  *              example, because it was previously wired down), asserting
 842  *              the "unwiring" parameter will speed the search.
 843  *
 844  *              If the operation can be interrupted (by thread_abort
 845  *              or thread_terminate), then the "interruptible"
 846  *              parameter should be asserted.
 847  *
 848  *      Results:
 849  *              The page containing the proper data is returned
 850  *              in "result_page".
 851  *
 852  *      In/out conditions:
 853  *              The source object must be locked and referenced,
 854  *              and must donate one paging reference.  The reference
 855  *              is not affected.  The paging reference and lock are
 856  *              consumed.
 857  *
 858  *              If the call succeeds, the object in which "result_page"
 859  *              resides is left locked and holding a paging reference.
 860  *              If this is not the original object, a busy page in the
 861  *              original object is returned in "top_page", to prevent other
 862  *              callers from pursuing this same data, along with a paging
 863  *              reference for the original object.  The "top_page" should
 864  *              be destroyed when this guarantee is no longer required.
 865  *              The "result_page" is also left busy.  It is not removed
 866  *              from the pageout queues.
 867  *      Special Case:
 868  *              A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
 869  *              fault succeeded but there's no VM page (i.e. the VM object
 870  *              does not actually hold VM pages, but device memory or
 871  *              large pages).  The object is still locked and we still hold a
 872  *              paging_in_progress reference.
 873  */
/*
 * Event counter: incremented in vm_fault_page() each time a fault has
 * finished waiting for an object's "blocked_access" flag to clear.
 */
unsigned int vm_fault_page_blocked_access = 0;
/*
 * NOTE(review): presumably counts faults that were forced to retry
 * (cf. "force_fault_retry" in vm_fault_page()); the place where it is
 * updated is not visible in this part of the file -- confirm.
 */
unsigned int vm_fault_page_forced_retry = 0;
 876
 877 vm_fault_return_t
 878 vm_fault_page(
 879         /* Arguments: */
 880         vm_object_t     first_object,   /* Object to begin search */
 881         vm_object_offset_t first_offset,        /* Offset into object */
 882         vm_prot_t       fault_type,     /* What access is requested */
 883         boolean_t       must_be_resident,/* Must page be resident? */
 884         boolean_t       caller_lookup,  /* caller looked up page */
 885         /* Modifies in place: */
 886         vm_prot_t       *protection,    /* Protection for mapping */
 887         vm_page_t       *result_page,   /* Page found, if successful */
 888         /* Returns: */
 889         vm_page_t       *top_page,      /* Page in top object, if
 890                                          * not result_page.  */
 891         int             *type_of_fault, /* if non-null, fill in with type of fault
 892                                          * COW, zero-fill, etc... returned in trace point */
 893         /* More arguments: */
 894         kern_return_t   *error_code,    /* code if page is in error */
 895         boolean_t       no_zero_fill,   /* don't zero fill absent pages */
 896         boolean_t       data_supply,    /* treat as data_supply if
 897                                          * it is a write fault and a full
 898                                          * page is provided */
 899         vm_object_fault_info_t fault_info)
 900 {
 901         vm_page_t               m;
 902         vm_object_t             object;
 903         vm_object_offset_t      offset;
 904         vm_page_t               first_m;
 905         vm_object_t             next_object;
 906         vm_object_t             copy_object;
 907         boolean_t               look_for_page;
 908         boolean_t               force_fault_retry = FALSE;
 909         vm_prot_t               access_required = fault_type;
 910         vm_prot_t               wants_copy_flag;
 911         kern_return_t           wait_result;
 912         wait_interrupt_t        interruptible_state;
 913         boolean_t               data_already_requested = FALSE;
 914         vm_behavior_t           orig_behavior;
 915         vm_size_t               orig_cluster_size;
 916         vm_fault_return_t       error;
 917         int                     my_fault;
 918         uint32_t                try_failed_count;
 919         int                     interruptible; /* how may fault be interrupted? */
 920         int                     external_state = VM_EXTERNAL_STATE_UNKNOWN;
 921         memory_object_t         pager;
 922         vm_fault_return_t       retval;
 923         int                     grab_options;
 924
 925 /*
 926  * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
 927  * marked as paged out in the compressor pager or the pager doesn't exist.
 928  * Note also that if the pager for an internal object
 929  * has not been created, the pager is not invoked regardless of the value
 930  * of MUST_ASK_PAGER().
 931  *
 932  * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 933  * is marked as paged out in the compressor pager.
 934  * PAGED_OUT() is used to determine if a page has already been pushed
 935  * into a copy object in order to avoid a redundant page out operation.
 936  */
 937 #define MUST_ASK_PAGER(o, f, s)                                 \
 938         ((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
 939
 940 #define PAGED_OUT(o, f) \
 941         (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
 942
 943 /*
 944  *      Recovery actions
 945  */
 946 #define RELEASE_PAGE(m)                                 \
 947         MACRO_BEGIN                                     \
 948         PAGE_WAKEUP_DONE(m);                            \
 949         if ( !VM_PAGE_PAGEABLE(m)) {                    \
 950                 vm_page_lockspin_queues();              \
 951                 if ( !VM_PAGE_PAGEABLE(m)) {            \
 952                         if (VM_CONFIG_COMPRESSOR_IS_ACTIVE)     \
 953                                 vm_page_deactivate(m);          \
 954                         else                                    \
 955                                 vm_page_activate(m);            \
 956                 }                                               \
 957                 vm_page_unlock_queues();                        \
 958         }                                                       \
 959         MACRO_END
 960
 961 #if TRACEFAULTPAGE
 962         dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
 963 #endif
 964
 965         interruptible = fault_info->interruptible;
 966         interruptible_state = thread_interrupt_level(interruptible);
 967
 968         /*
 969          *      INVARIANTS (through entire routine):
 970          *
 971          *      1)      At all times, we must either have the object
 972          *              lock or a busy page in some object to prevent
 973          *              some other thread from trying to bring in
 974          *              the same page.
 975          *
 976          *              Note that we cannot hold any locks during the
 977          *              pager access or when waiting for memory, so
 978          *              we use a busy page then.
 979          *
 980          *      2)      To prevent another thread from racing us down the
 981          *              shadow chain and entering a new page in the top
 982          *              object before we do, we must keep a busy page in
 983          *              the top object while following the shadow chain.
 984          *
 985          *      3)      We must increment paging_in_progress on any object
 986          *              for which we have a busy page before dropping
 987          *              the object lock
 988          *
 989          *      4)      We leave busy pages on the pageout queues.
 990          *              If the pageout daemon comes across a busy page,
 991          *              it will remove the page from the pageout queues.
 992          */
 993
 994         object = first_object;
 995         offset = first_offset;
 996         first_m = VM_PAGE_NULL;
 997         access_required = fault_type;
 998
 999         /*
1000          * default type of fault
1001          */
1002         my_fault = DBG_CACHE_HIT_FAULT;
1003
1004         while (TRUE) {
1005 #if TRACEFAULTPAGE
1006                 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1007 #endif
1008
1009                 grab_options = 0;
1010 #if CONFIG_SECLUDED_MEMORY
1011                 if (object->can_grab_secluded) {
1012                         grab_options |= VM_PAGE_GRAB_SECLUDED;
1013                 }
1014 #endif /* CONFIG_SECLUDED_MEMORY */
1015
1016                 if (!object->alive) {
1017                         /*
1018                          * object is no longer valid
1019                          * clean up and return error
1020                          */
1021                         vm_fault_cleanup(object, first_m);
1022                         thread_interrupt_level(interruptible_state);
1023
1024                         return VM_FAULT_MEMORY_ERROR;
1025                 }
1026
1027                 if (!object->pager_created && object->phys_contiguous) {
1028                         /*
1029                          * A physically-contiguous object without a pager:
1030                          * must be a "large page" object.  We do not deal
1031                          * with VM pages for this object.
1032                          */
1033                         caller_lookup = FALSE;
1034                         m = VM_PAGE_NULL;
1035                         goto phys_contig_object;
1036                 }
1037
1038                 if (object->blocked_access) {
1039                         /*
1040                          * Access to this VM object has been blocked.
1041                          * Replace our "paging_in_progress" reference with
1042                          * a "activity_in_progress" reference and wait for
1043                          * access to be unblocked.
1044                          */
1045                         caller_lookup = FALSE; /* no longer valid after sleep */
1046                         vm_object_activity_begin(object);
1047                         vm_object_paging_end(object);
1048                         while (object->blocked_access) {
1049                                 vm_object_sleep(object,
1050                                     VM_OBJECT_EVENT_UNBLOCKED,
1051                                     THREAD_UNINT);
1052                         }
1053                         vm_fault_page_blocked_access++;
1054                         vm_object_paging_begin(object);
1055                         vm_object_activity_end(object);
1056                 }
1057
1058                 /*
1059                  * See whether the page at 'offset' is resident
1060                  */
1061                 if (caller_lookup == TRUE) {
1062                         /*
1063                          * The caller has already looked up the page
1064                          * and gave us the result in "result_page".
1065                          * We can use this for the first lookup but
1066                          * it loses its validity as soon as we unlock
1067                          * the object.
1068                          */
1069                         m = *result_page;
1070                         caller_lookup = FALSE; /* no longer valid after that */
1071                 } else {
1072                         m = vm_page_lookup(object, offset);
1073                 }
1074 #if TRACEFAULTPAGE
1075                 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1076 #endif
1077                 if (m != VM_PAGE_NULL) {
1078                         if (m->vmp_busy) {
1079                                 /*
1080                                  * The page is being brought in,
1081                                  * wait for it and then retry.
1082                                  */
1083 #if TRACEFAULTPAGE
1084                                 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1085 #endif
1086                                 wait_result = PAGE_SLEEP(object, m, interruptible);
1087
1088                                 counter(c_vm_fault_page_block_busy_kernel++);
1089
1090                                 if (wait_result != THREAD_AWAKENED) {
1091                                         vm_fault_cleanup(object, first_m);
1092                                         thread_interrupt_level(interruptible_state);
1093
1094                                         if (wait_result == THREAD_RESTART) {
1095                                                 return VM_FAULT_RETRY;
1096                                         } else {
1097                                                 return VM_FAULT_INTERRUPTED;
1098                                         }
1099                                 }
1100                                 continue;
1101                         }
1102                         if (m->vmp_laundry) {
1103                                 m->vmp_free_when_done = FALSE;
1104
1105                                 if (!m->vmp_cleaning) {
1106                                         vm_pageout_steal_laundry(m, FALSE);
1107                                 }
1108                         }
1109                         if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
1110                                 /*
1111                                  * Guard page: off limits !
1112                                  */
1113                                 if (fault_type == VM_PROT_NONE) {
1114                                         /*
1115                                          * The fault is not requesting any
1116                                          * access to the guard page, so it must
1117                                          * be just to wire or unwire it.
1118                                          * Let's pretend it succeeded...
1119                                          */
1120                                         m->vmp_busy = TRUE;
1121                                         *result_page = m;
1122                                         assert(first_m == VM_PAGE_NULL);
1123                                         *top_page = first_m;
1124                                         if (type_of_fault) {
1125                                                 *type_of_fault = DBG_GUARD_FAULT;
1126                                         }
1127                                         thread_interrupt_level(interruptible_state);
1128                                         return VM_FAULT_SUCCESS;
1129                                 } else {
1130                                         /*
1131                                          * The fault requests access to the
1132                                          * guard page: let's deny that !
1133                                          */
1134                                         vm_fault_cleanup(object, first_m);
1135                                         thread_interrupt_level(interruptible_state);
1136                                         return VM_FAULT_MEMORY_ERROR;
1137                                 }
1138                         }
1139
1140                         if (m->vmp_error) {
1141                                 /*
1142                                  * The page is in error, give up now.
1143                                  */
1144 #if TRACEFAULTPAGE
1145                                 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);      /* (TEST/DEBUG) */
1146 #endif
1147                                 if (error_code) {
1148                                         *error_code = KERN_MEMORY_ERROR;
1149                                 }
1150                                 VM_PAGE_FREE(m);
1151
1152                                 vm_fault_cleanup(object, first_m);
1153                                 thread_interrupt_level(interruptible_state);
1154
1155                                 return VM_FAULT_MEMORY_ERROR;
1156                         }
1157                         if (m->vmp_restart) {
1158                                 /*
1159                                  * The pager wants us to restart
1160                                  * at the top of the chain,
1161                                  * typically because it has moved the
1162                                  * page to another pager, then do so.
1163                                  */
1164 #if TRACEFAULTPAGE
1165                                 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1166 #endif
1167                                 VM_PAGE_FREE(m);
1168
1169                                 vm_fault_cleanup(object, first_m);
1170                                 thread_interrupt_level(interruptible_state);
1171
1172                                 return VM_FAULT_RETRY;
1173                         }
1174                         if (m->vmp_absent) {
1175                                 /*
1176                                  * The page isn't busy, but is absent,
1177                                  * therefore it's deemed "unavailable".
1178                                  *
1179                                  * Remove the non-existent page (unless it's
1180                                  * in the top object) and move on down to the
1181                                  * next object (if there is one).
1182                                  */
1183 #if TRACEFAULTPAGE
1184                                 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);  /* (TEST/DEBUG) */
1185 #endif
1186                                 next_object = object->shadow;
1187
1188                                 if (next_object == VM_OBJECT_NULL) {
1189                                         /*
1190                                          * Absent page at bottom of shadow
1191                                          * chain; zero fill the page we left
1192                                          * busy in the first object, and free
1193                                          * the absent page.
1194                                          */
1195                                         assert(!must_be_resident);
1196
1197                                         /*
1198                                          * check for any conditions that prevent
1199                                          * us from creating a new zero-fill page
1200                                          * vm_fault_check will do all of the
1201                                          * fault cleanup in the case of an error condition
1202                                          * including resetting the thread_interrupt_level
1203                                          */
1204                                         error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1205
1206                                         if (error != VM_FAULT_SUCCESS) {
1207                                                 return error;
1208                                         }
1209
1210                                         if (object != first_object) {
1211                                                 /*
1212                                                  * free the absent page we just found
1213                                                  */
1214                                                 VM_PAGE_FREE(m);
1215
1216                                                 /*
1217                                                  * drop reference and lock on current object
1218                                                  */
1219                                                 vm_object_paging_end(object);
1220                                                 vm_object_unlock(object);
1221
1222                                                 /*
1223                                                  * grab the original page we
1224                                                  * 'soldered' in place and
1225                                                  * retake lock on 'first_object'
1226                                                  */
1227                                                 m = first_m;
1228                                                 first_m = VM_PAGE_NULL;
1229
1230                                                 object = first_object;
1231                                                 offset = first_offset;
1232
1233                                                 vm_object_lock(object);
1234                                         } else {
1235                                                 /*
1236                                                  * we're going to use the absent page we just found
1237                                                  * so convert it to a 'busy' page
1238                                                  */
1239                                                 m->vmp_absent = FALSE;
1240                                                 m->vmp_busy = TRUE;
1241                                         }
1242                                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
1243                                                 m->vmp_absent = TRUE;
1244                                         }
1245                                         /*
1246                                          * zero-fill the page and put it on
1247                                          * the correct paging queue
1248                                          */
1249                                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1250
1251                                         break;
1252                                 } else {
1253                                         if (must_be_resident) {
1254                                                 vm_object_paging_end(object);
1255                                         } else if (object != first_object) {
1256                                                 vm_object_paging_end(object);
1257                                                 VM_PAGE_FREE(m);
1258                                         } else {
1259                                                 first_m = m;
1260                                                 m->vmp_absent = FALSE;
1261                                                 m->vmp_busy = TRUE;
1262
1263                                                 vm_page_lockspin_queues();
1264                                                 vm_page_queues_remove(m, FALSE);
1265                                                 vm_page_unlock_queues();
1266                                         }
1267
1268                                         offset += object->vo_shadow_offset;
1269                                         fault_info->lo_offset += object->vo_shadow_offset;
1270                                         fault_info->hi_offset += object->vo_shadow_offset;
1271                                         access_required = VM_PROT_READ;
1272
1273                                         vm_object_lock(next_object);
1274                                         vm_object_unlock(object);
1275                                         object = next_object;
1276                                         vm_object_paging_begin(object);
1277
1278                                         /*
1279                                          * reset to default type of fault
1280                                          */
1281                                         my_fault = DBG_CACHE_HIT_FAULT;
1282
1283                                         continue;
1284                                 }
1285                         }
1286                         if ((m->vmp_cleaning)
1287                             && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1288                             && (fault_type & VM_PROT_WRITE)) {
1289                                 /*
1290                                  * This is a copy-on-write fault that will
1291                                  * cause us to revoke access to this page, but
1292                                  * this page is in the process of being cleaned
1293                                  * in a clustered pageout. We must wait until
1294                                  * the cleaning operation completes before
1295                                  * revoking access to the original page,
1296                                  * otherwise we might attempt to remove a
1297                                  * wired mapping.
1298                                  */
1299 #if TRACEFAULTPAGE
1300                                 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
1301 #endif
1302                                 /*
1303                                  * take an extra ref so that object won't die
1304                                  */
1305                                 vm_object_reference_locked(object);
1306
1307                                 vm_fault_cleanup(object, first_m);
1308
1309                                 counter(c_vm_fault_page_block_backoff_kernel++);
1310                                 vm_object_lock(object);
1311                                 assert(object->ref_count > 0);
1312
1313                                 m = vm_page_lookup(object, offset);
1314
1315                                 if (m != VM_PAGE_NULL && m->vmp_cleaning) {
1316                                         PAGE_ASSERT_WAIT(m, interruptible);
1317
1318                                         vm_object_unlock(object);
1319                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1320                                         vm_object_deallocate(object);
1321
1322                                         goto backoff;
1323                                 } else {
1324                                         vm_object_unlock(object);
1325
1326                                         vm_object_deallocate(object);
1327                                         thread_interrupt_level(interruptible_state);
1328
1329                                         return VM_FAULT_RETRY;
1330                                 }
1331                         }
1332                         if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
1333                             !(fault_info != NULL && fault_info->stealth)) {
1334                                 /*
1335                                  * If we were passed a non-NULL pointer for
1336                                  * "type_of_fault", than we came from
1337                                  * vm_fault... we'll let it deal with
1338                                  * this condition, since it
1339                                  * needs to see m->vmp_speculative to correctly
1340                                  * account the pageins, otherwise...
1341                                  * take it off the speculative queue, we'll
1342                                  * let the caller of vm_fault_page deal
1343                                  * with getting it onto the correct queue
1344                                  *
1345                                  * If the caller specified in fault_info that
1346                                  * it wants a "stealth" fault, we also leave
1347                                  * the page in the speculative queue.
1348                                  */
1349                                 vm_page_lockspin_queues();
1350                                 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
1351                                         vm_page_queues_remove(m, FALSE);
1352                                 }
1353                                 vm_page_unlock_queues();
1354                         }
1355                         assert(object == VM_PAGE_OBJECT(m));
1356
1357                         if (object->code_signed) {
1358                                 /*
1359                                  * CODE SIGNING:
1360                                  * We just paged in a page from a signed
1361                                  * memory object but we don't need to
1362                                  * validate it now.  We'll validate it if
1363                                  * and when it gets mapped into a user address
1364                                  * space for the first time or when the page
1365                                  * gets copied to another object as a result
1366                                  * of a copy-on-write.
1367                                  */
1368                         }
1369
1370                         /*
1371                          * We mark the page busy and leave it on
1372                          * the pageout queues.  If the pageout
1373                          * daemon comes across it, then it will
1374                          * remove the page from the queue, but not the object
1375                          */
1376 #if TRACEFAULTPAGE
1377                         dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1378 #endif
1379                         assert(!m->vmp_busy);
1380                         assert(!m->vmp_absent);
1381
1382                         m->vmp_busy = TRUE;
1383                         break;
1384                 }
1385
1386
1387                 /*
1388                  * we get here when there is no page present in the object at
1389                  * the offset we're interested in... we'll allocate a page
1390                  * at this point if the pager associated with
1391                  * this object can provide the data or we're the top object...
1392                  * object is locked;  m == NULL
1393                  */
1394
1395                 if (must_be_resident) {
1396                         if (fault_type == VM_PROT_NONE &&
1397                             object == kernel_object) {
1398                                 /*
1399                                  * We've been called from vm_fault_unwire()
1400                                  * while removing a map entry that was allocated
1401                                  * with KMA_KOBJECT and KMA_VAONLY.  This page
1402                                  * is not present and there's nothing more to
1403                                  * do here (nothing to unwire).
1404                                  */
1405                                 vm_fault_cleanup(object, first_m);
1406                                 thread_interrupt_level(interruptible_state);
1407
1408                                 return VM_FAULT_MEMORY_ERROR;
1409                         }
1410
1411                         goto dont_look_for_page;
1412                 }
1413
1414                 /* Don't expect to fault pages into the kernel object. */
1415                 assert(object != kernel_object);
1416
1417                 data_supply = FALSE;
1418
1419                 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply);
1420
1421 #if TRACEFAULTPAGE
1422                 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);      /* (TEST/DEBUG) */
1423 #endif
1424                 if (!look_for_page && object == first_object && !object->phys_contiguous) {
1425                         /*
1426                          * Allocate a new page for this object/offset pair as a placeholder
1427                          */
1428                         m = vm_page_grab_options(grab_options);
1429 #if TRACEFAULTPAGE
1430                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1431 #endif
1432                         if (m == VM_PAGE_NULL) {
1433                                 vm_fault_cleanup(object, first_m);
1434                                 thread_interrupt_level(interruptible_state);
1435
1436                                 return VM_FAULT_MEMORY_SHORTAGE;
1437                         }
1438
1439                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1440                                 vm_page_insert_internal(m, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1441                         } else {
1442                                 vm_page_insert(m, object, offset);
1443                         }
1444                 }
1445                 if (look_for_page) {
1446                         kern_return_t   rc;
1447                         int             my_fault_type;
1448
1449                         /*
1450                          *      If the memory manager is not ready, we
1451                          *      cannot make requests.
1452                          */
1453                         if (!object->pager_ready) {
1454 #if TRACEFAULTPAGE
1455                                 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1456 #endif
1457                                 if (m != VM_PAGE_NULL) {
1458                                         VM_PAGE_FREE(m);
1459                                 }
1460
1461                                 /*
1462                                  * take an extra ref so object won't die
1463                                  */
1464                                 vm_object_reference_locked(object);
1465                                 vm_fault_cleanup(object, first_m);
1466                                 counter(c_vm_fault_page_block_backoff_kernel++);
1467
1468                                 vm_object_lock(object);
1469                                 assert(object->ref_count > 0);
1470
1471                                 if (!object->pager_ready) {
1472                                         wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1473
1474                                         vm_object_unlock(object);
1475                                         if (wait_result == THREAD_WAITING) {
1476                                                 wait_result = thread_block(THREAD_CONTINUE_NULL);
1477                                         }
1478                                         vm_object_deallocate(object);
1479
1480                                         goto backoff;
1481                                 } else {
1482                                         vm_object_unlock(object);
1483                                         vm_object_deallocate(object);
1484                                         thread_interrupt_level(interruptible_state);
1485
1486                                         return VM_FAULT_RETRY;
1487                                 }
1488                         }
1489                         if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1490                                 /*
1491                                  * If there are too many outstanding page
1492                                  * requests pending on this external object, we
1493                                  * wait for them to be resolved now.
1494                                  */
1495 #if TRACEFAULTPAGE
1496                                 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1497 #endif
1498                                 if (m != VM_PAGE_NULL) {
1499                                         VM_PAGE_FREE(m);
1500                                 }
1501                                 /*
1502                                  * take an extra ref so object won't die
1503                                  */
1504                                 vm_object_reference_locked(object);
1505
1506                                 vm_fault_cleanup(object, first_m);
1507
1508                                 counter(c_vm_fault_page_block_backoff_kernel++);
1509
1510                                 vm_object_lock(object);
1511                                 assert(object->ref_count > 0);
1512
1513                                 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1514                                         vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1515
1516                                         vm_object_unlock(object);
1517                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1518                                         vm_object_deallocate(object);
1519
1520                                         goto backoff;
1521                                 } else {
1522                                         vm_object_unlock(object);
1523                                         vm_object_deallocate(object);
1524                                         thread_interrupt_level(interruptible_state);
1525
1526                                         return VM_FAULT_RETRY;
1527                                 }
1528                         }
1529                         if (object->internal) {
1530                                 int compressed_count_delta;
1531
1532                                 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
1533
1534                                 if (m == VM_PAGE_NULL) {
1535                                         /*
1536                                          * Allocate a new page for this object/offset pair as a placeholder
1537                                          */
1538                                         m = vm_page_grab_options(grab_options);
1539 #if TRACEFAULTPAGE
1540                                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1541 #endif
1542                                         if (m == VM_PAGE_NULL) {
1543                                                 vm_fault_cleanup(object, first_m);
1544                                                 thread_interrupt_level(interruptible_state);
1545
1546                                                 return VM_FAULT_MEMORY_SHORTAGE;
1547                                         }
1548
1549                                         m->vmp_absent = TRUE;
1550                                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1551                                                 vm_page_insert_internal(m, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1552                                         } else {
1553                                                 vm_page_insert(m, object, offset);
1554                                         }
1555                                 }
1556                                 assert(m->vmp_busy);
1557
1558                                 m->vmp_absent = TRUE;
1559                                 pager = object->pager;
1560
1561                                 assert(object->paging_in_progress > 0);
1562                                 vm_object_unlock(object);
1563
1564                                 rc = vm_compressor_pager_get(
1565                                         pager,
1566                                         offset + object->paging_offset,
1567                                         VM_PAGE_GET_PHYS_PAGE(m),
1568                                         &my_fault_type,
1569                                         0,
1570                                         &compressed_count_delta);
1571
1572                                 if (type_of_fault == NULL) {
1573                                         int     throttle_delay;
1574
1575                                         /*
1576                                          * we weren't called from vm_fault, so we
1577                                          * need to apply page creation throttling
1578                                          * do it before we re-acquire any locks
1579                                          */
1580                                         if (my_fault_type == DBG_COMPRESSOR_FAULT) {
1581                                                 if ((throttle_delay = vm_page_throttled(TRUE))) {
1582                                                         VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
1583                                                         delay(throttle_delay);
1584                                                 }
1585                                         }
1586                                 }
1587                                 vm_object_lock(object);
1588                                 assert(object->paging_in_progress > 0);
1589
1590                                 vm_compressor_pager_count(
1591                                         pager,
1592                                         compressed_count_delta,
1593                                         FALSE, /* shared_lock */
1594                                         object);
1595
1596                                 switch (rc) {
1597                                 case KERN_SUCCESS:
1598                                         m->vmp_absent = FALSE;
1599                                         m->vmp_dirty = TRUE;
1600                                         if ((object->wimg_bits &
1601                                             VM_WIMG_MASK) !=
1602                                             VM_WIMG_USE_DEFAULT) {
1603                                                 /*
1604                                                  * If the page is not cacheable,
1605                                                  * we can't let its contents
1606                                                  * linger in the data cache
1607                                                  * after the decompression.
1608                                                  */
1609                                                 pmap_sync_page_attributes_phys(
1610                                                         VM_PAGE_GET_PHYS_PAGE(m));
1611                                         } else {
1612                                                 m->vmp_written_by_kernel = TRUE;
1613                                         }
1614
1615                                         /*
1616                                          * If the object is purgeable, its
1617                                          * owner's purgeable ledgers have been
1618                                          * updated in vm_page_insert() but the
1619                                          * page was also accounted for in a
1620                                          * "compressed purgeable" ledger, so
1621                                          * update that now.
1622                                          */
1623                                         if (((object->purgable !=
1624                                             VM_PURGABLE_DENY) ||
1625                                             object->vo_ledger_tag) &&
1626                                             (object->vo_owner !=
1627                                             NULL)) {
1628                                                 /*
1629                                                  * One less compressed
1630                                                  * purgeable/tagged page.
1631                                                  */
1632                                                 vm_object_owner_compressed_update(
1633                                                         object,
1634                                                         -1);
1635                                         }
1636
1637                                         break;
1638                                 case KERN_MEMORY_FAILURE:
1639                                         m->vmp_unusual = TRUE;
1640                                         m->vmp_error = TRUE;
1641                                         m->vmp_absent = FALSE;
1642                                         break;
1643                                 case KERN_MEMORY_ERROR:
1644                                         assert(m->vmp_absent);
1645                                         break;
1646                                 default:
1647                                         panic("vm_fault_page(): unexpected "
1648                                             "error %d from "
1649                                             "vm_compressor_pager_get()\n",
1650                                             rc);
1651                                 }
1652                                 PAGE_WAKEUP_DONE(m);
1653
1654                                 rc = KERN_SUCCESS;
1655                                 goto data_requested;
1656                         }
1657                         my_fault_type = DBG_PAGEIN_FAULT;
1658
1659                         if (m != VM_PAGE_NULL) {
1660                                 VM_PAGE_FREE(m);
1661                                 m = VM_PAGE_NULL;
1662                         }
1663
1664 #if TRACEFAULTPAGE
1665                         dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);  /* (TEST/DEBUG) */
1666 #endif
1667
1668                         /*
1669                          * It's possible someone called vm_object_destroy while we weren't
1670                          * holding the object lock.  If that has happened, then bail out
1671                          * here.
1672                          */
1673
1674                         pager = object->pager;
1675
1676                         if (pager == MEMORY_OBJECT_NULL) {
1677                                 vm_fault_cleanup(object, first_m);
1678                                 thread_interrupt_level(interruptible_state);
1679                                 return VM_FAULT_MEMORY_ERROR;
1680                         }
1681
1682                         /*
1683                          * We have an absent page in place for the faulting offset,
1684                          * so we can release the object lock.
1685                          */
1686
1687                         if (object->object_is_shared_cache) {
1688                                 set_thread_rwlock_boost();
1689                         }
1690
1691                         vm_object_unlock(object);
1692
1693                         /*
1694                          * If this object uses a copy_call strategy,
1695                          * and we are interested in a copy of this object
1696                          * (having gotten here only by following a
1697                          * shadow chain), then tell the memory manager
1698                          * via a flag added to the desired_access
1699                          * parameter, so that it can detect a race
1700                          * between our walking down the shadow chain
1701                          * and its pushing pages up into a copy of
1702                          * the object that it manages.
1703                          */
1704                         if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) {
1705                                 wants_copy_flag = VM_PROT_WANTS_COPY;
1706                         } else {
1707                                 wants_copy_flag = VM_PROT_NONE;
1708                         }
1709
1710                         if (object->copy == first_object) {
1711                                 /*
1712                                  * if we issue the memory_object_data_request in
1713                                  * this state, we are subject to a deadlock with
1714                                  * the underlying filesystem if it is trying to
1715                                  * shrink the file resulting in a push of pages
1716                                  * into the copy object...  that push will stall
1717                                  * on the placeholder page, and if the pushing thread
1718                                  * is holding a lock that is required on the pagein
1719                                  * path (such as a truncate lock), we'll deadlock...
1720                                  * to avoid this potential deadlock, we throw away
1721                                  * our placeholder page before calling memory_object_data_request
1722                                  * and force this thread to retry the vm_fault_page after
1723                                  * we have issued the I/O.  the second time through this path
1724                                  * we will find the page already in the cache (presumably still
1725                                  * busy waiting for the I/O to complete) and then complete
1726                                  * the fault w/o having to go through memory_object_data_request again
1727                                  */
1728                                 assert(first_m != VM_PAGE_NULL);
1729                                 assert(VM_PAGE_OBJECT(first_m) == first_object);
1730
1731                                 vm_object_lock(first_object);
1732                                 VM_PAGE_FREE(first_m);
1733                                 vm_object_paging_end(first_object);
1734                                 vm_object_unlock(first_object);
1735
1736                                 first_m = VM_PAGE_NULL;
1737                                 force_fault_retry = TRUE;
1738
1739                                 vm_fault_page_forced_retry++;
1740                         }
1741
1742                         if (data_already_requested == TRUE) {
1743                                 orig_behavior = fault_info->behavior;
1744                                 orig_cluster_size = fault_info->cluster_size;
1745
1746                                 fault_info->behavior = VM_BEHAVIOR_RANDOM;
1747                                 fault_info->cluster_size = PAGE_SIZE;
1748                         }
1749                         /*
1750                          * Call the memory manager to retrieve the data.
1751                          */
1752                         rc = memory_object_data_request(
1753                                 pager,
1754                                 offset + object->paging_offset,
1755                                 PAGE_SIZE,
1756                                 access_required | wants_copy_flag,
1757                                 (memory_object_fault_info_t)fault_info);
1758
1759                         if (data_already_requested == TRUE) {
1760                                 fault_info->behavior = orig_behavior;
1761                                 fault_info->cluster_size = orig_cluster_size;
1762                         } else {
1763                                 data_already_requested = TRUE;
1764                         }
1765
1766                         DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1767 #if TRACEFAULTPAGE
1768                         dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1769 #endif
1770                         vm_object_lock(object);
1771
1772                         if (object->object_is_shared_cache) {
1773                                 clear_thread_rwlock_boost();
1774                         }
1775
1776 data_requested:
1777                         if (rc != KERN_SUCCESS) {
1778                                 vm_fault_cleanup(object, first_m);
1779                                 thread_interrupt_level(interruptible_state);
1780
1781                                 return (rc == MACH_SEND_INTERRUPTED) ?
1782                                        VM_FAULT_INTERRUPTED :
1783                                        VM_FAULT_MEMORY_ERROR;
1784                         } else {
1785                                 clock_sec_t     tv_sec;
1786                                 clock_usec_t    tv_usec;
1787
1788                                 if (my_fault_type == DBG_PAGEIN_FAULT) {
1789                                         clock_get_system_microtime(&tv_sec, &tv_usec);
1790                                         current_thread()->t_page_creation_time = tv_sec;
1791                                         current_thread()->t_page_creation_count = 0;
1792                                 }
1793                         }
1794                         if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1795                                 vm_fault_cleanup(object, first_m);
1796                                 thread_interrupt_level(interruptible_state);
1797
1798                                 return VM_FAULT_INTERRUPTED;
1799                         }
1800                         if (force_fault_retry == TRUE) {
1801                                 vm_fault_cleanup(object, first_m);
1802                                 thread_interrupt_level(interruptible_state);
1803
1804                                 return VM_FAULT_RETRY;
1805                         }
1806                         if (m == VM_PAGE_NULL && object->phys_contiguous) {
1807                                 /*
1808                                  * No page here means that the object we
1809                                  * initially looked up was "physically
1810                                  * contiguous" (i.e. device memory).  However,
1811                                  * with Virtual VRAM, the object might not
1812                                  * be backed by that device memory anymore,
1813                                  * so we're done here only if the object is
1814                                  * still "phys_contiguous".
1815                                  * Otherwise, if the object is no longer
1816                                  * "phys_contiguous", we need to retry the
1817                                  * page fault against the object's new backing
1818                                  * store (different memory object).
1819                                  */
1820 phys_contig_object:
1821                                 goto done;
1822                         }
1823                         /*
1824                          * potentially a pagein fault
1825                          * if we make it through the state checks
1826                          * above, then we'll count it as such
1827                          */
1828                         my_fault = my_fault_type;
1829
1830                         /*
1831                          * Retry with same object/offset, since new data may
1832                          * be in a different page (i.e., m is meaningless at
1833                          * this point).
1834                          */
1835                         continue;
1836                 }
1837 dont_look_for_page:
1838                 /*
1839                  * We get here if the object has no pager, or an existence map
1840                  * exists and indicates the page isn't present on the pager
1841                  * or we're unwiring a page.  If a pager exists, but there
1842                  * is no existence map, then the m->vmp_absent case above handles
1843                  * the ZF case when the pager can't provide the page
1844                  */
1845 #if TRACEFAULTPAGE
1846                 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1847 #endif
1848                 if (object == first_object) {
1849                         first_m = m;
1850                 } else {
1851                         assert(m == VM_PAGE_NULL);
1852                 }
1853
1854                 next_object = object->shadow;
1855
1856                 if (next_object == VM_OBJECT_NULL) {
1857                         /*
1858                                  * we've hit the bottom of the shadow chain,
1859                          * fill the page in the top object with zeros.
1860                          */
1861                         assert(!must_be_resident);
1862
1863                         if (object != first_object) {
1864                                 vm_object_paging_end(object);
1865                                 vm_object_unlock(object);
1866
1867                                 object = first_object;
1868                                 offset = first_offset;
1869                                 vm_object_lock(object);
1870                         }
1871                         m = first_m;
1872                         assert(VM_PAGE_OBJECT(m) == object);
1873                         first_m = VM_PAGE_NULL;
1874
1875                         /*
1876                          * check for any conditions that prevent
1877                          * us from creating a new zero-fill page
1878                          * vm_fault_check will do all of the
1879                          * fault cleanup in the case of an error condition
1880                          * including resetting the thread_interrupt_level
1881                          */
1882                         error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1883
1884                         if (error != VM_FAULT_SUCCESS) {
1885                                 return error;
1886                         }
1887
1888                         if (m == VM_PAGE_NULL) {
1889                                 m = vm_page_grab_options(grab_options);
1890
1891                                 if (m == VM_PAGE_NULL) {
1892                                         vm_fault_cleanup(object, VM_PAGE_NULL);
1893                                         thread_interrupt_level(interruptible_state);
1894
1895                                         return VM_FAULT_MEMORY_SHORTAGE;
1896                                 }
1897                                 vm_page_insert(m, object, offset);
1898                         }
1899                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
1900                                 m->vmp_absent = TRUE;
1901                         }
1902
1903                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1904
1905                         break;
1906                 } else {
1907                         /*
1908                          * Move on to the next object.  Lock the next
1909                          * object before unlocking the current one.
1910                          */
1911                         if ((object != first_object) || must_be_resident) {
1912                                 vm_object_paging_end(object);
1913                         }
1914
1915                         offset += object->vo_shadow_offset;
1916                         fault_info->lo_offset += object->vo_shadow_offset;
1917                         fault_info->hi_offset += object->vo_shadow_offset;
1918                         access_required = VM_PROT_READ;
1919
1920                         vm_object_lock(next_object);
1921                         vm_object_unlock(object);
1922
1923                         object = next_object;
1924                         vm_object_paging_begin(object);
1925                 }
1926         }
1927
1928         /*
1929          *      PAGE HAS BEEN FOUND.
1930          *
1931          *      This page (m) is:
1932          *              busy, so that we can play with it;
1933          *              not absent, so that nobody else will fill it;
1934          *              possibly eligible for pageout;
1935          *
1936          *      The top-level page (first_m) is:
1937          *              VM_PAGE_NULL if the page was found in the
1938          *               top-level object;
1939          *              busy, not absent, and ineligible for pageout.
1940          *
1941          *      The current object (object) is locked.  A paging
1942          *      reference is held for the current and top-level
1943          *      objects.
1944          */
1945
1946 #if TRACEFAULTPAGE
1947         dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1948 #endif
1949 #if     EXTRA_ASSERTIONS
1950         assert(m->vmp_busy && !m->vmp_absent);
1951         assert((first_m == VM_PAGE_NULL) ||
1952             (first_m->vmp_busy && !first_m->vmp_absent &&
1953             !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded));
1954 #endif  /* EXTRA_ASSERTIONS */
1955
1956         /*
1957          * If the page is being written, but isn't
1958          * already owned by the top-level object,
1959          * we have to copy it into a new page owned
1960          * by the top-level object.
1961          */
1962         if (object != first_object) {
1963 #if TRACEFAULTPAGE
1964                 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1965 #endif
1966                 if (fault_type & VM_PROT_WRITE) {
1967                         vm_page_t copy_m;
1968
1969                         /*
1970                          * We only really need to copy if we
1971                          * want to write it.
1972                          */
1973                         assert(!must_be_resident);
1974
1975                         /*
1976                          * If we try to collapse first_object at this
1977                          * point, we may deadlock when we try to get
1978                          * the lock on an intermediate object (since we
1979                          * have the bottom object locked).  We can't
1980                          * unlock the bottom object, because the page
1981                          * we found may move (by collapse) if we do.
1982                          *
1983                          * Instead, we first copy the page.  Then, when
1984                          * we have no more use for the bottom object,
1985                          * we unlock it and try to collapse.
1986                          *
1987                          * Note that we copy the page even if we didn't
1988                          * need to... that's the breaks.
1989                          */
1990
1991                         /*
1992                          * Allocate a page for the copy
1993                          */
1994                         copy_m = vm_page_grab_options(grab_options);
1995
1996                         if (copy_m == VM_PAGE_NULL) {
1997                                 RELEASE_PAGE(m);
1998
1999                                 vm_fault_cleanup(object, first_m);
2000                                 thread_interrupt_level(interruptible_state);
2001
2002                                 return VM_FAULT_MEMORY_SHORTAGE;
2003                         }
2004
2005                         vm_page_copy(m, copy_m);
2006
2007                         /*
2008                          * If another map is truly sharing this
2009                          * page with us, we have to flush all
2010                          * uses of the original page, since we
2011                          * can't distinguish those which want the
2012                          * original from those which need the
2013                          * new copy.
2014                          *
2015                          * XXXO If we know that only one map has
2016                          * access to this page, then we could
2017                          * avoid the pmap_disconnect() call.
2018                          */
2019                         if (m->vmp_pmapped) {
2020                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2021                         }
2022
2023                         if (m->vmp_clustered) {
2024                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2025                                 VM_PAGE_CONSUME_CLUSTERED(m);
2026                         }
2027                         assert(!m->vmp_cleaning);
2028
2029                         /*
2030                          * We no longer need the old page or object.
2031                          */
2032                         RELEASE_PAGE(m);
2033
2034                         /*
2035                          * This check helps with marking the object as having a sequential pattern
2036                          * Normally we'll miss doing this below because this fault is about COW to
2037                          * the first_object i.e. bring page in from disk, push to object above but
2038                          * don't update the file object's sequential pattern.
2039                          */
2040                         if (object->internal == FALSE) {
2041                                 vm_fault_is_sequential(object, offset, fault_info->behavior);
2042                         }
2043
2044                         vm_object_paging_end(object);
2045                         vm_object_unlock(object);
2046
2047                         my_fault = DBG_COW_FAULT;
2048                         VM_STAT_INCR(cow_faults);
2049                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2050                         current_task()->cow_faults++;
2051
2052                         object = first_object;
2053                         offset = first_offset;
2054
2055                         vm_object_lock(object);
2056                         /*
2057                          * get rid of the place holder
2058                          * page that we soldered in earlier
2059                          */
2060                         VM_PAGE_FREE(first_m);
2061                         first_m = VM_PAGE_NULL;
2062
2063                         /*
2064                          * and replace it with the
2065                          * page we just copied into
2066                          */
2067                         assert(copy_m->vmp_busy);
2068                         vm_page_insert(copy_m, object, offset);
2069                         SET_PAGE_DIRTY(copy_m, TRUE);
2070
2071                         m = copy_m;
2072                         /*
2073                          * Now that we've gotten the copy out of the
2074                          * way, let's try to collapse the top object.
2075                          * But we have to play ugly games with
2076                          * paging_in_progress to do that...
2077                          */
2078                         vm_object_paging_end(object);
2079                         vm_object_collapse(object, offset, TRUE);
2080                         vm_object_paging_begin(object);
2081                 } else {
2082                         *protection &= (~VM_PROT_WRITE);
2083                 }
2084         }
2085         /*
2086          * Now check whether the page needs to be pushed into the
2087          * copy object.  The use of asymmetric copy on write for
2088          * shared temporary objects means that we may do two copies to
2089          * satisfy the fault; one above to get the page from a
2090          * shadowed object, and one here to push it into the copy.
2091          */
2092         try_failed_count = 0;
2093
2094         while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
2095                 vm_object_offset_t      copy_offset;
2096                 vm_page_t               copy_m;
2097
2098 #if TRACEFAULTPAGE
2099                 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);    /* (TEST/DEBUG) */
2100 #endif
2101                 /*
2102                  * If the page is being written, but hasn't been
2103                  * copied to the copy-object, we have to copy it there.
2104                  */
2105                 if ((fault_type & VM_PROT_WRITE) == 0) {
2106                         *protection &= ~VM_PROT_WRITE;
2107                         break;
2108                 }
2109
2110                 /*
2111                  * If the page was guaranteed to be resident,
2112                  * we must have already performed the copy.
2113                  */
2114                 if (must_be_resident) {
2115                         break;
2116                 }
2117
2118                 /*
2119                  * Try to get the lock on the copy_object.
2120                  */
2121                 if (!vm_object_lock_try(copy_object)) {
2122                         vm_object_unlock(object);
2123                         try_failed_count++;
2124
2125                         mutex_pause(try_failed_count);  /* wait a bit */
2126                         vm_object_lock(object);
2127
2128                         continue;
2129                 }
2130                 try_failed_count = 0;
2131
2132                 /*
2133                  * Make another reference to the copy-object,
2134                  * to keep it from disappearing during the
2135                  * copy.
2136                  */
2137                 vm_object_reference_locked(copy_object);
2138
2139                 /*
2140                  * Does the page exist in the copy?
2141                  */
2142                 copy_offset = first_offset - copy_object->vo_shadow_offset;
2143
2144                 if (copy_object->vo_size <= copy_offset) {
2145                         /*
2146                          * Copy object doesn't cover this page -- do nothing.
2147                          */
2148                         ;
2149                 } else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2150                         /*
2151                          * Page currently exists in the copy object
2152                          */
2153                         if (copy_m->vmp_busy) {
2154                                 /*
2155                                  * If the page is being brought
2156                                  * in, wait for it and then retry.
2157                                  */
2158                                 RELEASE_PAGE(m);
2159
2160                                 /*
2161                                  * take an extra ref so object won't die
2162                                  */
2163                                 vm_object_reference_locked(copy_object);
2164                                 vm_object_unlock(copy_object);
2165                                 vm_fault_cleanup(object, first_m);
2166                                 counter(c_vm_fault_page_block_backoff_kernel++);
2167
2168                                 vm_object_lock(copy_object);
2169                                 assert(copy_object->ref_count > 0);
2170                                 VM_OBJ_RES_DECR(copy_object);
2171                                 vm_object_lock_assert_exclusive(copy_object);
2172                                 copy_object->ref_count--;
2173                                 assert(copy_object->ref_count > 0);
2174                                 copy_m = vm_page_lookup(copy_object, copy_offset);
2175
2176                                 if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) {
2177                                         PAGE_ASSERT_WAIT(copy_m, interruptible);
2178
2179                                         vm_object_unlock(copy_object);
2180                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
2181                                         vm_object_deallocate(copy_object);
2182
2183                                         goto backoff;
2184                                 } else {
2185                                         vm_object_unlock(copy_object);
2186                                         vm_object_deallocate(copy_object);
2187                                         thread_interrupt_level(interruptible_state);
2188
2189                                         return VM_FAULT_RETRY;
2190                                 }
2191                         }
2192                 } else if (!PAGED_OUT(copy_object, copy_offset)) {
2193                         /*
2194                          * If PAGED_OUT is TRUE, then the page used to exist
2195                          * in the copy-object, and has already been paged out.
2196                          * We don't need to repeat this. If PAGED_OUT is
2197                          * FALSE, then either we don't know (!pager_created,
2198                          * for example) or it hasn't been paged out.
2199                          * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2200                          * We must copy the page to the copy object.
2201                          *
2202                          * Allocate a page for the copy
2203                          */
2204                         copy_m = vm_page_alloc(copy_object, copy_offset);
2205
2206                         if (copy_m == VM_PAGE_NULL) {
2207                                 RELEASE_PAGE(m);
2208
2209                                 VM_OBJ_RES_DECR(copy_object);
2210                                 vm_object_lock_assert_exclusive(copy_object);
2211                                 copy_object->ref_count--;
2212                                 assert(copy_object->ref_count > 0);
2213
2214                                 vm_object_unlock(copy_object);
2215                                 vm_fault_cleanup(object, first_m);
2216                                 thread_interrupt_level(interruptible_state);
2217
2218                                 return VM_FAULT_MEMORY_SHORTAGE;
2219                         }
2220                         /*
2221                          * Must copy page into copy-object.
2222                          */
2223                         vm_page_copy(m, copy_m);
2224
2225                         /*
2226                          * If the old page was in use by any users
2227                          * of the copy-object, it must be removed
2228                          * from all pmaps.  (We can't know which
2229                          * pmaps use it.)
2230                          */
2231                         if (m->vmp_pmapped) {
2232                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2233                         }
2234
2235                         if (m->vmp_clustered) {
2236                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2237                                 VM_PAGE_CONSUME_CLUSTERED(m);
2238                         }
2239                         /*
2240                          * If there's a pager, then immediately
2241                          * page out this page, using the "initialize"
2242                          * option.  Else, we use the copy.
2243                          */
2244                         if ((!copy_object->pager_ready)
2245                             || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2246                             ) {
2247                                 vm_page_lockspin_queues();
2248                                 assert(!m->vmp_cleaning);
2249                                 vm_page_activate(copy_m);
2250                                 vm_page_unlock_queues();
2251
2252                                 SET_PAGE_DIRTY(copy_m, TRUE);
2253                                 PAGE_WAKEUP_DONE(copy_m);
2254                         } else {
2255                                 assert(copy_m->vmp_busy == TRUE);
2256                                 assert(!m->vmp_cleaning);
2257
2258                                 /*
2259                                  * dirty is protected by the object lock
2260                                  */
2261                                 SET_PAGE_DIRTY(copy_m, TRUE);
2262
2263                                 /*
2264                                  * The page is already ready for pageout:
2265                                  * not on pageout queues and busy.
2266                                  * Unlock everything except the
2267                                  * copy_object itself.
2268                                  */
2269                                 vm_object_unlock(object);
2270
2271                                 /*
2272                                  * Write the page to the copy-object,
2273                                  * flushing it from the kernel.
2274                                  */
2275                                 vm_pageout_initialize_page(copy_m);
2276
2277                                 /*
2278                                  * Since the pageout may have
2279                                  * temporarily dropped the
2280                                  * copy_object's lock, we
2281                                  * check whether we'll have
2282                                  * to deallocate the hard way.
2283                                  */
2284                                 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2285                                         vm_object_unlock(copy_object);
2286                                         vm_object_deallocate(copy_object);
2287                                         vm_object_lock(object);
2288
2289                                         continue;
2290                                 }
2291                                 /*
2292                                  * Pick back up the old object's
2293                                  * lock.  [It is safe to do so,
2294                                  * since it must be deeper in the
2295                                  * object tree.]
2296                                  */
2297                                 vm_object_lock(object);
2298                         }
2299
2300                         /*
2301                          * Because we're pushing a page upward
2302                          * in the object tree, we must restart
2303                          * any faults that are waiting here.
2304                          * [Note that this is an expansion of
2305                          * PAGE_WAKEUP that uses the THREAD_RESTART
2306                          * wait result].  Can't turn off the page's
2307                          * busy bit because we're not done with it.
2308                          */
2309                         if (m->vmp_wanted) {
2310                                 m->vmp_wanted = FALSE;
2311                                 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2312                         }
2313                 }
2314                 /*
2315                  * The reference count on copy_object must be
2316                  * at least 2: one for our extra reference,
2317                  * and at least one from the outside world
2318                  * (we checked that when we last locked
2319                  * copy_object).
2320                  */
2321                 vm_object_lock_assert_exclusive(copy_object);
2322                 copy_object->ref_count--;
2323                 assert(copy_object->ref_count > 0);
2324
2325                 VM_OBJ_RES_DECR(copy_object);
2326                 vm_object_unlock(copy_object);
2327
2328                 break;
2329         }
2330
2331 done:
2332         *result_page = m;
2333         *top_page = first_m;
2334
2335         if (m != VM_PAGE_NULL) {
2336                 assert(VM_PAGE_OBJECT(m) == object);
2337
2338                 retval = VM_FAULT_SUCCESS;
2339
2340                 if (my_fault == DBG_PAGEIN_FAULT) {
2341                         VM_PAGE_COUNT_AS_PAGEIN(m);
2342
2343                         if (object->internal) {
2344                                 my_fault = DBG_PAGEIND_FAULT;
2345                         } else {
2346                                 my_fault = DBG_PAGEINV_FAULT;
2347                         }
2348
2349                         /*
2350                          * evaluate access pattern and update state
2351                          * vm_fault_deactivate_behind depends on the
2352                          * state being up to date
2353                          */
2354                         vm_fault_is_sequential(object, offset, fault_info->behavior);
2355                         vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2356                 } else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) {
2357                         /*
2358                          * we weren't called from vm_fault, so handle the
2359                          * accounting here for hits in the cache
2360                          */
2361                         if (m->vmp_clustered) {
2362                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2363                                 VM_PAGE_CONSUME_CLUSTERED(m);
2364                         }
2365                         vm_fault_is_sequential(object, offset, fault_info->behavior);
2366                         vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2367                 } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2368                         VM_STAT_DECOMPRESSIONS();
2369                 }
2370                 if (type_of_fault) {
2371                         *type_of_fault = my_fault;
2372                 }
2373         } else {
2374                 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2375                 assert(first_m == VM_PAGE_NULL);
2376                 assert(object == first_object);
2377         }
2378
2379         thread_interrupt_level(interruptible_state);
2380
2381 #if TRACEFAULTPAGE
2382         dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);       /* (TEST/DEBUG) */
2383 #endif
2384         return retval;
2385
2386 backoff:
2387         thread_interrupt_level(interruptible_state);
2388
2389         if (wait_result == THREAD_INTERRUPTED) {
2390                 return VM_FAULT_INTERRUPTED;
2391         }
2392         return VM_FAULT_RETRY;
2393
2394 #undef  RELEASE_PAGE
2395 }
2396
2397
2398
2399 /*
2400  * CODE SIGNING: VM_FAULT_NEED_CS_VALIDATION(pmap, page, page_obj)
2401  * When soft faulting a page, we have to validate the page if ALL of:
2402  * 1. the page is being mapped in user space ((pmap) is not kernel_pmap)
2403  * 2. the page hasn't already been found to be "tainted" (vmp_cs_tainted clear)
2404  * 3. the page belongs to a code-signed object ((page_obj)->code_signed set)
2405  * 4. the page has not been validated yet or has been mapped for write.
2406  * "page" is expanded more than once; pass an expression without side effects. */
2407 #define VM_FAULT_NEED_CS_VALIDATION(pmap, page, page_obj)               \
2408         ((pmap) != kernel_pmap /*1*/ &&                                 \
2409          !(page)->vmp_cs_tainted /*2*/ &&                                       \
2410          (page_obj)->code_signed /*3*/ &&                                       \
2411          (!(page)->vmp_cs_validated || (page)->vmp_wpmapped /*4*/ ))
2412
2413
2414 /*
2415  * page queue lock must NOT be held
2416  * m->vmp_object must be locked
2417  *
2418  * NOTE: m->vmp_object could be locked "shared" only if we are called
2419  * from vm_fault() as part of a soft fault.  If so, we must be
2420  * careful not to modify the VM object in any way that is not
2421  * legal under a shared lock...
2422  */
2423 extern int panic_on_cs_killed;     /* extern: definition is not in this excerpt */
2424 extern int proc_selfpid(void);     /* extern: definition is not in this excerpt */
2425 extern char *proc_name_address(void *p);   /* extern: definition is not in this excerpt */
2426 unsigned long cs_enter_tainted_rejected = 0;       /* NOTE(review): presumably counts tainted pages refused on enter; updates not visible here -- confirm */
2427 unsigned long cs_enter_tainted_accepted = 0;       /* NOTE(review): presumably counts tainted pages allowed on enter; updates not visible here -- confirm */
2428 kern_return_t
2429 vm_fault_enter(vm_page_t m,
2430     pmap_t pmap,
2431     vm_map_offset_t vaddr,
2432     vm_prot_t prot,
2433     vm_prot_t caller_prot,
2434     boolean_t wired,
2435     boolean_t change_wiring,
2436     vm_tag_t  wire_tag,
2437     vm_object_fault_info_t fault_info,
2438     boolean_t *need_retry,
2439     int *type_of_fault)
2440 {
2441         kern_return_t   kr, pe_result;
2442         boolean_t       previously_pmapped = m->vmp_pmapped;
2443         boolean_t       must_disconnect = 0;
2444         boolean_t       map_is_switched, map_is_switch_protected;
2445         boolean_t       cs_violation;
2446         int             cs_enforcement_enabled;
2447         vm_prot_t       fault_type;
2448         vm_object_t     object;
2449         boolean_t       no_cache = fault_info->no_cache;
2450         boolean_t       cs_bypass = fault_info->cs_bypass;
2451         int             pmap_options = fault_info->pmap_options;
2452
2453         fault_type = change_wiring ? VM_PROT_NONE : caller_prot;
2454         object = VM_PAGE_OBJECT(m);
2455
2456         vm_object_lock_assert_held(object);
2457
2458 #if KASAN
2459         if (pmap == kernel_pmap) {
2460                 kasan_notify_address(vaddr, PAGE_SIZE);
2461         }
2462 #endif
2463
2464         LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2465
2466         if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
2467                 assert(m->vmp_fictitious);
2468                 return KERN_SUCCESS;
2469         }
2470
2471         if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
2472                 vm_object_lock_assert_exclusive(object);
2473         } else if ((fault_type & VM_PROT_WRITE) == 0 &&
2474             (!m->vmp_wpmapped
2475 #if VM_OBJECT_ACCESS_TRACKING
2476             || object->access_tracking
2477 #endif /* VM_OBJECT_ACCESS_TRACKING */
2478             )) {
2479                 /*
2480                  * This is not a "write" fault, so we
2481                  * might not have taken the object lock
2482                  * exclusively and we might not be able
2483                  * to update the "wpmapped" bit in
2484                  * vm_fault_enter().
2485                  * Let's just grant read access to
2486                  * the page for now and we'll
2487                  * soft-fault again if we need write
2488                  * access later...
2489                  */
2490
2491                 /* This had better not be a JIT page. */
2492                 if (!pmap_has_prot_policy(prot)) {
2493                         prot &= ~VM_PROT_WRITE;
2494                 } else {
2495                         assert(cs_bypass);
2496                 }
2497         }
2498         if (m->vmp_pmapped == FALSE) {
2499                 if (m->vmp_clustered) {
2500                         if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
2501                                 /*
2502                                  * found it in the cache, but this
2503                                  * is the first fault-in of the page (m->vmp_pmapped == FALSE)
2504                                  * so it must have come in as part of
2505                                  * a cluster... account 1 pagein against it
2506                                  */
2507                                 if (object->internal) {
2508                                         *type_of_fault = DBG_PAGEIND_FAULT;
2509                                 } else {
2510                                         *type_of_fault = DBG_PAGEINV_FAULT;
2511                                 }
2512
2513                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2514                         }
2515                         VM_PAGE_CONSUME_CLUSTERED(m);
2516                 }
2517         }
2518
2519         if (*type_of_fault != DBG_COW_FAULT) {
2520                 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2521
2522                 if (pmap == kernel_pmap) {
2523                         DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2524                 }
2525         }
2526
2527         /* Validate code signature if necessary. */
2528         if (!cs_bypass &&
2529             VM_FAULT_NEED_CS_VALIDATION(pmap, m, object)) {
2530                 vm_object_lock_assert_exclusive(object);
2531
2532                 if (m->vmp_cs_validated) {
2533                         vm_cs_revalidates++;
2534                 }
2535
2536                 /* VM map is locked, so 1 ref will remain on VM object -
2537                  * so no harm if vm_page_validate_cs drops the object lock */
2538
2539 #if PMAP_CS
2540                 if (fault_info->pmap_cs_associated &&
2541                     pmap_cs_enforced(pmap) &&
2542                     !m->vmp_cs_validated &&
2543                     !m->vmp_cs_tainted &&
2544                     !m->vmp_cs_nx &&
2545                     (prot & VM_PROT_EXECUTE) &&
2546                     (caller_prot & VM_PROT_EXECUTE)) {
2547                         /*
2548                          * With pmap_cs, the pmap layer will validate the
2549                          * code signature for any executable pmap mapping.
2550                          * No need for us to validate this page too:
2551                          * in pmap_cs we trust...
2552                          */
2553                         vm_cs_defer_to_pmap_cs++;
2554                 } else {
2555                         vm_cs_defer_to_pmap_cs_not++;
2556                         vm_page_validate_cs(m);
2557                 }
2558 #else /* PMAP_CS */
2559                 vm_page_validate_cs(m);
2560 #endif /* PMAP_CS */
2561         }
2562
2563 #define page_immutable(m, prot) ((m)->vmp_cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/ )
2564 #define page_nx(m) ((m)->vmp_cs_nx)
2565
2566         map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2567             (pmap == vm_map_pmap(current_thread()->map)));
2568         map_is_switch_protected = current_thread()->map->switch_protect;
2569
2570         /* If the map is switched, and is switch-protected, we must protect
2571          * some pages from being write-faulted: immutable pages because by
2572          * definition they may not be written, and executable pages because that
2573          * would provide a way to inject unsigned code.
2574          * If the page is immutable, we can simply return. However, we can't
2575          * immediately determine whether a page is executable anywhere. But,
2576          * we can disconnect it everywhere and remove the executable protection
2577          * from the current map. We do that below right before we do the
2578          * PMAP_ENTER.
2579          */
2580         cs_enforcement_enabled = cs_process_enforcement(NULL);
2581
2582         if (cs_enforcement_enabled && map_is_switched &&
2583             map_is_switch_protected && page_immutable(m, prot) &&
2584             (prot & VM_PROT_WRITE)) {
2585                 return KERN_CODESIGN_ERROR;
2586         }
2587
2588         if (cs_enforcement_enabled && page_nx(m) && (prot & VM_PROT_EXECUTE)) {
2589                 if (cs_debug) {
2590                         printf("page marked to be NX, not letting it be mapped EXEC\n");
2591                 }
2592                 return KERN_CODESIGN_ERROR;
2593         }
2594
2595         /* A page could be tainted, or pose a risk of being tainted later.
2596          * Check whether the receiving process wants it, and make it feel
2597          * the consequences (that happens in cs_invalid_page()).
2598          * For CS Enforcement, two other conditions will
2599          * cause that page to be tainted as well:
2600          * - pmapping an unsigned page executable - this means unsigned code;
2601          * - writeable mapping of a validated page - the content of that page
2602          *   can be changed without the kernel noticing, therefore unsigned
2603          *   code can be created
2604          */
2605         if (cs_bypass) {
2606                 /* code-signing is bypassed */
2607                 cs_violation = FALSE;
2608         } else if (m->vmp_cs_tainted) {
2609                 /* tainted page */
2610                 cs_violation = TRUE;
2611         } else if (!cs_enforcement_enabled) {
2612                 /* no further code-signing enforcement */
2613                 cs_violation = FALSE;
2614         } else if (page_immutable(m, prot) &&
2615             ((prot & VM_PROT_WRITE) ||
2616             m->vmp_wpmapped)) {
2617                 /*
2618                  * The page should be immutable, but is in danger of being
2619                  * modified.
2620                  * This is the case where we want policy from the code
2621                  * directory - is the page immutable or not? For now we have
2622                  * to assume that code pages will be immutable, data pages not.
2623                  * We'll assume a page is a code page if it has a code directory
2624                  * and we fault for execution.
2625                  * That is good enough since if we faulted the code page for
2626                  * writing in another map before, it is wpmapped; if we fault
2627                  * it for writing in this map later it will also be faulted for
2628                  * executing at the same time; and if we fault for writing in
2629                  * another map later, we will disconnect it from this pmap so
2630                  * we'll notice the change.
2631                  */
2632                 cs_violation = TRUE;
2633         } else if (!m->vmp_cs_validated &&
2634             (prot & VM_PROT_EXECUTE)
2635 #if PMAP_CS
2636             /*
2637              * Executable pages will be validated by pmap_cs;
2638              * in pmap_cs we trust...
2639              * If pmap_cs is turned off, this is a code-signing
2640              * violation.
2641              */
2642             && !(pmap_cs_enforced(pmap))
2643 #endif /* PMAP_CS */
2644             ) {
2645                 cs_violation = TRUE;
2646         } else {
2647                 cs_violation = FALSE;
2648         }
2649
2650         if (cs_violation) {
2651                 /* We will have a tainted page. Have to handle the special case
2652                  * of a switched map now. If the map is not switched, standard
2653                  * procedure applies - call cs_invalid_page().
2654                  * If the map is switched, the real owner is invalid already.
2655                  * There is no point in invalidating the switching process since
2656                  * it will not be executing from the map. So we don't call
2657                  * cs_invalid_page() in that case. */
2658                 boolean_t reject_page, cs_killed;
2659                 if (map_is_switched) {
2660                         assert(pmap == vm_map_pmap(current_thread()->map));
2661                         assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2662                         reject_page = FALSE;
2663                 } else {
2664                         if (cs_debug > 5) {
2665                                 printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
2666                                     object->code_signed ? "yes" : "no",
2667                                     m->vmp_cs_validated ? "yes" : "no",
2668                                     m->vmp_cs_tainted ? "yes" : "no",
2669                                     m->vmp_wpmapped ? "yes" : "no",
2670                                     (int)prot);
2671                         }
2672                         reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed);
2673                 }
2674
2675                 if (reject_page) {
2676                         /* reject the invalid page: abort the page fault */
2677                         int                     pid;
2678                         const char              *procname;
2679                         task_t                  task;
2680                         vm_object_t             file_object, shadow;
2681                         vm_object_offset_t      file_offset;
2682                         char                    *pathname, *filename;
2683                         vm_size_t               pathname_len, filename_len;
2684                         boolean_t               truncated_path;
2685 #define __PATH_MAX 1024
2686                         struct timespec         mtime, cs_mtime;
2687                         int                     shadow_depth;
2688                         os_reason_t             codesigning_exit_reason = OS_REASON_NULL;
2689
2690                         kr = KERN_CODESIGN_ERROR;
2691                         cs_enter_tainted_rejected++;
2692
2693                         /* get process name and pid */
2694                         procname = "?";
2695                         task = current_task();
2696                         pid = proc_selfpid();
2697                         if (task->bsd_info != NULL) {
2698                                 procname = proc_name_address(task->bsd_info);
2699                         }
2700
2701                         /* get file's VM object */
2702                         file_object = object;
2703                         file_offset = m->vmp_offset;
2704                         for (shadow = file_object->shadow,
2705                             shadow_depth = 0;
2706                             shadow != VM_OBJECT_NULL;
2707                             shadow = file_object->shadow,
2708                             shadow_depth++) {
2709                                 vm_object_lock_shared(shadow);
2710                                 if (file_object != object) {
2711                                         vm_object_unlock(file_object);
2712                                 }
2713                                 file_offset += file_object->vo_shadow_offset;
2714                                 file_object = shadow;
2715                         }
2716
2717                         mtime.tv_sec = 0;
2718                         mtime.tv_nsec = 0;
2719                         cs_mtime.tv_sec = 0;
2720                         cs_mtime.tv_nsec = 0;
2721
2722                         /* get file's pathname and/or filename */
2723                         pathname = NULL;
2724                         filename = NULL;
2725                         pathname_len = 0;
2726                         filename_len = 0;
2727                         truncated_path = FALSE;
2728                         /* no pager -> no file -> no pathname, use "<nil>" in that case */
2729                         if (file_object->pager != NULL) {
2730                                 pathname = (char *)kalloc(__PATH_MAX * 2);
2731                                 if (pathname) {
2732                                         pathname[0] = '\0';
2733                                         pathname_len = __PATH_MAX;
2734                                         filename = pathname + pathname_len;
2735                                         filename_len = __PATH_MAX;
2736
2737                                         if (vnode_pager_get_object_name(file_object->pager,
2738                                             pathname,
2739                                             pathname_len,
2740                                             filename,
2741                                             filename_len,
2742                                             &truncated_path) == KERN_SUCCESS) {
2743                                                 /* safety first... */
2744                                                 pathname[__PATH_MAX - 1] = '\0';
2745                                                 filename[__PATH_MAX - 1] = '\0';
2746
2747                                                 vnode_pager_get_object_mtime(file_object->pager,
2748                                                     &mtime,
2749                                                     &cs_mtime);
2750                                         } else {
2751                                                 kfree(pathname, __PATH_MAX * 2);
2752                                                 pathname = NULL;
2753                                                 filename = NULL;
2754                                                 pathname_len = 0;
2755                                                 filename_len = 0;
2756                                                 truncated_path = FALSE;
2757                                         }
2758                                 }
2759                         }
2760                         printf("CODE SIGNING: process %d[%s]: "
2761                             "rejecting invalid page at address 0x%llx "
2762                             "from offset 0x%llx in file \"%s%s%s\" "
2763                             "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2764                             "(signed:%d validated:%d tainted:%d nx:%d "
2765                             "wpmapped:%d dirty:%d depth:%d)\n",
2766                             pid, procname, (addr64_t) vaddr,
2767                             file_offset,
2768                             (pathname ? pathname : "<nil>"),
2769                             (truncated_path ? "/.../" : ""),
2770                             (truncated_path ? filename : ""),
2771                             cs_mtime.tv_sec, cs_mtime.tv_nsec,
2772                             ((cs_mtime.tv_sec == mtime.tv_sec &&
2773                             cs_mtime.tv_nsec == mtime.tv_nsec)
2774                             ? "=="
2775                             : "!="),
2776                             mtime.tv_sec, mtime.tv_nsec,
2777                             object->code_signed,
2778                             m->vmp_cs_validated,
2779                             m->vmp_cs_tainted,
2780                             m->vmp_cs_nx,
2781                             m->vmp_wpmapped,
2782                             m->vmp_dirty,
2783                             shadow_depth);
2784
2785                         /*
2786                          * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
2787                          * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
2788                          * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
2789                          * will deal with the segmentation fault.
2790                          */
2791                         if (cs_killed) {
2792                                 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
2793                                     pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE, 0, 0);
2794
2795                                 codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
2796                                 if (codesigning_exit_reason == NULL) {
2797                                         printf("vm_fault_enter: failed to allocate codesigning exit reason\n");
2798                                 } else {
2799                                         mach_vm_address_t data_addr = 0;
2800                                         struct codesigning_exit_reason_info *ceri = NULL;
2801                                         uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri));
2802
2803                                         if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) {
2804                                                 printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
2805                                         } else {
2806                                                 if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor,
2807                                                     EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) {
2808                                                         ceri = (struct codesigning_exit_reason_info *)data_addr;
2809                                                         static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname));
2810
2811                                                         ceri->ceri_virt_addr = vaddr;
2812                                                         ceri->ceri_file_offset = file_offset;
2813                                                         if (pathname) {
2814                                                                 strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname));
2815                                                         } else {
2816                                                                 ceri->ceri_pathname[0] = '\0';
2817                                                         }
2818                                                         if (filename) {
2819                                                                 strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename));
2820                                                         } else {
2821                                                                 ceri->ceri_filename[0] = '\0';
2822                                                         }
2823                                                         ceri->ceri_path_truncated = (truncated_path);
2824                                                         ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec;
2825                                                         ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec;
2826                                                         ceri->ceri_page_modtime_secs = mtime.tv_sec;
2827                                                         ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
2828                                                         ceri->ceri_object_codesigned = (object->code_signed);
2829                                                         ceri->ceri_page_codesig_validated = (m->vmp_cs_validated);
2830                                                         ceri->ceri_page_codesig_tainted = (m->vmp_cs_tainted);
2831                                                         ceri->ceri_page_codesig_nx = (m->vmp_cs_nx);
2832                                                         ceri->ceri_page_wpmapped = (m->vmp_wpmapped);
2833                                                         ceri->ceri_page_slid = 0;
2834                                                         ceri->ceri_page_dirty = (m->vmp_dirty);
2835                                                         ceri->ceri_page_shadow_depth = shadow_depth;
2836                                                 } else {
2837 #if DEBUG || DEVELOPMENT
2838                                                         panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
2839 #else
2840                                                         printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
2841 #endif /* DEBUG || DEVELOPMENT */
2842                                                         /* Free the buffer */
2843                                                         os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0);
2844                                                 }
2845                                         }
2846                                 }
2847
2848                                 set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE);
2849                         }
2850                         if (panic_on_cs_killed &&
2851                             object->object_is_shared_cache) {
2852                                 char *tainted_contents;
2853                                 vm_map_offset_t src_vaddr;
2854                                 src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m) << PAGE_SHIFT);
2855                                 tainted_contents = kalloc(PAGE_SIZE);
2856                                 bcopy((const char *)src_vaddr, tainted_contents, PAGE_SIZE);
2857                                 printf("CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m, VM_PAGE_GET_PHYS_PAGE(m), (uint64_t)src_vaddr, tainted_contents);
2858                                 panic("CODE SIGNING: process %d[%s]: "
2859                                     "rejecting invalid page (phys#0x%x) at address 0x%llx "
2860                                     "from offset 0x%llx in file \"%s%s%s\" "
2861                                     "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2862                                     "(signed:%d validated:%d tainted:%d nx:%d"
2863                                     "wpmapped:%d dirty:%d depth:%d)\n",
2864                                     pid, procname,
2865                                     VM_PAGE_GET_PHYS_PAGE(m),
2866                                     (addr64_t) vaddr,
2867                                     file_offset,
2868                                     (pathname ? pathname : "<nil>"),
2869                                     (truncated_path ? "/.../" : ""),
2870                                     (truncated_path ? filename : ""),
2871                                     cs_mtime.tv_sec, cs_mtime.tv_nsec,
2872                                     ((cs_mtime.tv_sec == mtime.tv_sec &&
2873                                     cs_mtime.tv_nsec == mtime.tv_nsec)
2874                                     ? "=="
2875                                     : "!="),
2876                                     mtime.tv_sec, mtime.tv_nsec,
2877                                     object->code_signed,
2878                                     m->vmp_cs_validated,
2879                                     m->vmp_cs_tainted,
2880                                     m->vmp_cs_nx,
2881                                     m->vmp_wpmapped,
2882                                     m->vmp_dirty,
2883                                     shadow_depth);
2884                         }
2885
2886                         if (file_object != object) {
2887                                 vm_object_unlock(file_object);
2888                         }
2889                         if (pathname_len != 0) {
2890                                 kfree(pathname, __PATH_MAX * 2);
2891                                 pathname = NULL;
2892                                 filename = NULL;
2893                         }
2894                 } else {
2895                         /* proceed with the invalid page */
2896                         kr = KERN_SUCCESS;
2897                         if (!m->vmp_cs_validated &&
2898                             !object->code_signed) {
2899                                 /*
2900                                  * This page has not been (fully) validated but
2901                                  * does not belong to a code-signed object
2902                                  * so it should not be forcefully considered
2903                                  * as tainted.
2904                                  * We're just concerned about it here because
2905                                  * we've been asked to "execute" it but that
2906                                  * does not mean that it should cause other
2907                                  * accesses to fail.
2908                                  * This happens when a debugger sets a
2909                                  * breakpoint and we then execute code in
2910                                  * that page.  Marking the page as "tainted"
2911                                  * would cause any inspection tool ("leaks",
2912                                  * "vmmap", "CrashReporter", ...) to get killed
2913                                  * due to code-signing violation on that page,
2914                                  * even though they're just reading it and not
2915                                  * executing from it.
2916                                  */
2917                         } else {
2918                                 /*
2919                                  * Page might have been tainted before or not;
2920                                  * now it definitively is. If the page wasn't
2921                                  * tainted, we must disconnect it from all
2922                                  * pmaps later, to force existing mappings
2923                                  * through that code path for re-consideration
2924                                  * of the validity of that page.
2925                                  */
2926                                 must_disconnect = !m->vmp_cs_tainted;
2927                                 m->vmp_cs_tainted = TRUE;
2928                         }
2929                         cs_enter_tainted_accepted++;
2930                 }
2931                 if (kr != KERN_SUCCESS) {
2932                         if (cs_debug) {
2933                                 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2934                                     "*** INVALID PAGE ***\n",
2935                                     (long long)vaddr);
2936                         }
2937 #if !SECURE_KERNEL
2938                         if (cs_enforcement_panic) {
2939                                 panic("CODESIGNING: panicking on invalid page\n");
2940                         }
2941 #endif
2942                 }
2943         } else {
2944                 /* proceed with the valid page */
2945                 kr = KERN_SUCCESS;
2946         }
2947
2948         boolean_t       page_queues_locked = FALSE;
2949 #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED()   /* calls vm_page_lockspin_queues() unless page_queues_locked is already set */ \
2950 MACRO_BEGIN                                     \
2951         if (! page_queues_locked) {             \
2952                 page_queues_locked = TRUE;      \
2953                 vm_page_lockspin_queues();      \
2954         }                                       \
2955 MACRO_END
2956 #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED()     /* calls vm_page_unlock_queues() only if page_queues_locked is set, then clears it */ \
2957 MACRO_BEGIN                                     \
2958         if (page_queues_locked) {               \
2959                 page_queues_locked = FALSE;     \
2960                 vm_page_unlock_queues();        \
2961         }                                       \
2962 MACRO_END
2963
2964         /*
2965          * Hold queues lock to manipulate
2966          * the page queues.  Change wiring
2967          * case is obvious.
2968          */
2969         assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object);
2970
2971 #if CONFIG_BACKGROUND_QUEUE
2972         vm_page_update_background_state(m);
2973 #endif
2974         if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
2975                 /*
2976                  * Compressor pages are neither wired
2977                  * nor pageable and should never change.
2978                  */
2979                 assert(object == compressor_object);
2980         } else if (change_wiring) {
2981                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2982
2983                 if (wired) {
2984                         if (kr == KERN_SUCCESS) {
2985                                 vm_page_wire(m, wire_tag, TRUE);
2986                         }
2987                 } else {
2988                         vm_page_unwire(m, TRUE);
2989                 }
2990                 /* we keep the page queues lock, if we need it later */
2991         } else {
2992                 if (object->internal == TRUE) {
2993                         /*
2994                          * don't allow anonymous pages on
2995                          * the speculative queues
2996                          */
2997                         no_cache = FALSE;
2998                 }
2999                 if (kr != KERN_SUCCESS) {
3000                         __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3001                         vm_page_deactivate(m);
3002                         /* we keep the page queues lock, if we need it later */
3003                 } else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
3004                     (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
3005                     (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
3006                     ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
3007                     !VM_PAGE_WIRED(m)) {
3008                         if (vm_page_local_q &&
3009                             (*type_of_fault == DBG_COW_FAULT ||
3010                             *type_of_fault == DBG_ZERO_FILL_FAULT)) {
3011                                 struct vpl      *lq;
3012                                 uint32_t        lid;
3013
3014                                 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3015
3016                                 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3017                                 vm_object_lock_assert_exclusive(object);
3018
3019                                 /*
3020                                  * we got a local queue to stuff this
3021                                  * new page on...
3022                                  * it's safe to manipulate local and
3023                                  * local_id at this point since we're
3024                                  * behind an exclusive object lock and
3025                                  * the page is not on any global queue.
3026                                  *
3027                                  * we'll use the current cpu number to
3028                                  * select the queue; note that we don't
3029                                  * need to disable preemption... we're
3030                                  * going to be behind the local queue's
3031                                  * lock to do the real work
3032                                  */
3033                                 lid = cpu_number();
3034
3035                                 lq = &vm_page_local_q[lid].vpl_un.vpl;
3036
3037                                 VPL_LOCK(&lq->vpl_lock);
3038
3039                                 vm_page_check_pageable_safe(m);
3040                                 vm_page_queue_enter(&lq->vpl_queue, m, vmp_pageq);
3041                                 m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
3042                                 m->vmp_local_id = lid;
3043                                 lq->vpl_count++;
3044
3045                                 if (object->internal) {
3046                                         lq->vpl_internal_count++;
3047                                 } else {
3048                                         lq->vpl_external_count++;
3049                                 }
3050
3051                                 VPL_UNLOCK(&lq->vpl_lock);
3052
3053                                 if (lq->vpl_count > vm_page_local_q_soft_limit) {
3054                                         /*
3055                                          * we're beyond the soft limit
3056                                          * for the local queue
3057                                          * vm_page_reactivate_local will
3058                                          * 'try' to take the global page
3059                                          * queue lock... if it can't
3060                                          * that's ok... we'll let the
3061                                          * queue continue to grow up
3062                                          * to the hard limit... at that
3063                                          * point we'll wait for the
3064                                          * lock... once we've got the
3065                                          * lock, we'll transfer all of
3066                                          * the pages from the local
3067                                          * queue to the global active
3068                                          * queue
3069                                          */
3070                                         vm_page_reactivate_local(lid, FALSE, FALSE);
3071                                 }
3072                         } else {
3073                                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3074
3075                                 /*
3076                                  * test again now that we hold the
3077                                  * page queue lock
3078                                  */
3079                                 if (!VM_PAGE_WIRED(m)) {
3080                                         if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3081                                                 vm_page_queues_remove(m, FALSE);
3082
3083                                                 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3084                                                 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1);
3085                                         }
3086
3087                                         if (!VM_PAGE_ACTIVE_OR_INACTIVE(m) ||
3088                                             no_cache) {
3089                                                 /*
3090                                                  * If this is a no_cache mapping
3091                                                  * and the page has never been
3092                                                  * mapped before or was
3093                                                  * previously a no_cache page,
3094                                                  * then we want to leave pages
3095                                                  * in the speculative state so
3096                                                  * that they can be readily
3097                                                  * recycled if free memory runs
3098                                                  * low.  Otherwise the page is
3099                                                  * activated as normal.
3100                                                  */
3101
3102                                                 if (no_cache &&
3103                                                     (!previously_pmapped ||
3104                                                     m->vmp_no_cache)) {
3105                                                         m->vmp_no_cache = TRUE;
3106
3107                                                         if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
3108                                                                 vm_page_speculate(m, FALSE);
3109                                                         }
3110                                                 } else if (!VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
3111                                                         vm_page_activate(m);
3112                                                 }
3113                                         }
3114                                 }
3115                                 /* we keep the page queues lock, if we need it later */
3116                         }
3117                 }
3118         }
3119         /* we're done with the page queues lock, if we ever took it */
3120         __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3121
3122
3123         /* If we have a KERN_SUCCESS from the previous checks, we either have
3124          * a good page, or a tainted page that has been accepted by the process.
3125          * In both cases the page will be entered into the pmap.
3126          * If the page is writeable, we need to disconnect it from other pmaps
3127          * now so those processes can take note.
3128          */
3129         if (kr == KERN_SUCCESS) {
3130                 /*
3131                  * NOTE: we may only hold the vm_object lock SHARED
3132                  * at this point, so we need the phys_page lock to
3133                  * properly serialize updating the pmapped and
3134                  * xpmapped bits
3135                  */
3136                 if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) {
3137                         ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3138
3139                         pmap_lock_phys_page(phys_page);
3140                         /*
3141                          * go ahead and take the opportunity
3142                          * to set 'pmapped' here so that we don't
3143                          * need to grab this lock a 2nd time
3144                          * just below
3145                          */
3146                         m->vmp_pmapped = TRUE;
3147
3148                         if (!m->vmp_xpmapped) {
3149                                 m->vmp_xpmapped = TRUE;
3150
3151                                 pmap_unlock_phys_page(phys_page);
3152
3153                                 if (!object->internal) {
3154                                         OSAddAtomic(1, &vm_page_xpmapped_external_count);
3155                                 }
3156
3157 #if defined(__arm__) || defined(__arm64__)
3158                                 pmap_sync_page_data_phys(phys_page);
3159 #else
3160                                 if (object->internal &&
3161                                     object->pager != NULL) {
3162                                         /*
3163                                          * This page could have been
3164                                          * uncompressed by the
3165                                          * compressor pager and its
3166                                          * contents might be only in
3167                                          * the data cache.
3168                                          * Since it's being mapped for
3169                                          * "execute" for the first time,
3170                                          * make sure the icache is in
3171                                          * sync.
3172                                          */
3173                                         assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3174                                         pmap_sync_page_data_phys(phys_page);
3175                                 }
3176 #endif
3177                         } else {
3178                                 pmap_unlock_phys_page(phys_page);
3179                         }
3180                 } else {
3181                         if (m->vmp_pmapped == FALSE) {
3182                                 ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3183
3184                                 pmap_lock_phys_page(phys_page);
3185                                 m->vmp_pmapped = TRUE;
3186                                 pmap_unlock_phys_page(phys_page);
3187                         }
3188                 }
3189
3190                 if (fault_type & VM_PROT_WRITE) {
3191                         if (m->vmp_wpmapped == FALSE) {
3192                                 vm_object_lock_assert_exclusive(object);
3193                                 if (!object->internal && object->pager) {
3194                                         task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager));
3195                                 }
3196                                 m->vmp_wpmapped = TRUE;
3197                         }
3198                         if (must_disconnect) {
3199                                 /*
3200                                  * We can only get here
3201                                  * because of the CSE logic
3202                                  */
3203                                 assert(cs_enforcement_enabled);
3204                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3205                                 /*
3206                                  * If we are faulting for a write, we can clear
3207                                  * the execute bit - that will ensure the page is
3208                                  * checked again before being executable, which
3209                                  * protects against a map switch.
3210                                  * This only happens the first time the page
3211                                  * gets tainted, so we won't get stuck here
3212                                  * to make an already writeable page executable.
3213                                  */
3214                                 if (!cs_bypass) {
3215                                         assert(!pmap_has_prot_policy(prot));
3216                                         prot &= ~VM_PROT_EXECUTE;
3217                                 }
3218                         }
3219                 }
3220                 assert(VM_PAGE_OBJECT(m) == object);
3221
3222 #if VM_OBJECT_ACCESS_TRACKING
3223                 if (object->access_tracking) {
3224                         DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type);
3225                         if (fault_type & VM_PROT_WRITE) {
3226                                 object->access_tracking_writes++;
3227                                 vm_object_access_tracking_writes++;
3228                         } else {
3229                                 object->access_tracking_reads++;
3230                                 vm_object_access_tracking_reads++;
3231                         }
3232                 }
3233 #endif /* VM_OBJECT_ACCESS_TRACKING */
3234
3235
3236 #if PMAP_CS
3237 pmap_enter_retry:
3238 #endif
3239                 /* Prevent a deadlock by not
3240                  * holding the object lock if we need to wait for a page in
3241                  * pmap_enter() - <rdar://problem/7138958> */
3242                 PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0,
3243                     wired,
3244                     pmap_options | PMAP_OPTIONS_NOWAIT,
3245                     pe_result);
3246 #if PMAP_CS
3247                 /*
3248                  * Retry without execute permission if we encountered a codesigning
3249                  * failure on a non-execute fault.  This allows applications which
3250                  * don't actually need to execute code to still map it for read access.
3251                  */
3252                 if ((pe_result == KERN_CODESIGN_ERROR) && pmap_cs_enforced(pmap) &&
3253                     (prot & VM_PROT_EXECUTE) && !(caller_prot & VM_PROT_EXECUTE)) {
3254                         prot &= ~VM_PROT_EXECUTE;
3255                         goto pmap_enter_retry;
3256                 }
3257 #endif
3258 #if __x86_64__
3259                 if (pe_result == KERN_INVALID_ARGUMENT &&
3260                     pmap == PMAP_NULL &&
3261                     wired) {
3262                         /*
3263                          * Wiring a page in a pmap-less VM map:
3264                          * VMware's "vmmon" kernel extension does this
3265                          * to grab pages.
3266                          * Let it proceed even though the PMAP_ENTER() failed.
3267                          */
3268                         pe_result = KERN_SUCCESS;
3269                 }
3270 #endif /* __x86_64__ */
3271
3272                 if (pe_result == KERN_RESOURCE_SHORTAGE) {
3273                         if (need_retry) {
3274                                 /*
3275                                  * this will be non-null in the case where we hold the lock
3276                                  * on the top-object in this chain... we can't just drop
3277                                  * the lock on the object we're inserting the page into
3278                                  * and recall the PMAP_ENTER since we can still cause
3279                                  * a deadlock if one of the critical paths tries to
3280                                  * acquire the lock on the top-object and we're blocked
3281                                  * in PMAP_ENTER waiting for memory... our only recourse
3282                                  * is to deal with it at a higher level where we can
3283                                  * drop both locks.
3284                                  */
3285                                 *need_retry = TRUE;
3286                                 vm_pmap_enter_retried++;
3287                                 goto after_the_pmap_enter;
3288                         }
3289                         /* The nonblocking version of pmap_enter did not succeed,
3290                          * and we don't need to drop other locks and retry
3291                          * at the level above us, so
3292                          * use the blocking version instead. Requires marking
3293                          * the page busy and unlocking the object */
3294                         boolean_t was_busy = m->vmp_busy;
3295
3296                         vm_object_lock_assert_exclusive(object);
3297
3298                         m->vmp_busy = TRUE;
3299                         vm_object_unlock(object);
3300
3301                         PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type,
3302                             0, wired,
3303                             pmap_options, pe_result);
3304
3305                         assert(VM_PAGE_OBJECT(m) == object);
3306
3307                         /* Take the object lock again. */
3308                         vm_object_lock(object);
3309
3310                         /* If the page was busy, someone else will wake it up.
3311                          * Otherwise, we have to do it now. */
3312                         assert(m->vmp_busy);
3313                         if (!was_busy) {
3314                                 PAGE_WAKEUP_DONE(m);
3315                         }
3316                         vm_pmap_enter_blocked++;
3317                 }
3318
3319                 kr = pe_result;
3320         }
3321
3322 after_the_pmap_enter:
3323         return kr;
3324 }
3325
/*
 *      Routine:        vm_pre_fault
 *      Purpose:
 *              Call vm_fault() on the current map for "vaddr", but
 *              only when pmap_find_phys() returns 0 for that address
 *              in the current map's pmap.  When pmap_find_phys()
 *              returns non-zero this routine does nothing.
 *              NOTE(review): a 0 result presumably means "no physical
 *              page is mapped at vaddr yet" -- confirm against the
 *              pmap layer.
 *      Parameters:
 *              vaddr   address to probe and, if needed, fault on
 *              prot    handed to vm_fault() as its fault_type
 *      Returns:
 *              Nothing.  The kern_return_t produced by vm_fault() is
 *              discarded, so a caller cannot tell whether the fault
 *              was resolved.
 */
3326 void
3327 vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot)
3328 {
3329         if (pmap_find_phys(current_map()->pmap, vaddr) == 0) {
              /*
               * Not a wiring request (change_wiring is FALSE, tag is
               * VM_KERN_MEMORY_NONE), the wait is THREAD_UNINT, and no
               * separate caller pmap is supplied.
               */
3330                 vm_fault(current_map(),      /* map */
3331                     vaddr,                   /* vaddr */
3332                     prot,                    /* fault_type */
3333                     FALSE,                   /* change_wiring */
3334                     VM_KERN_MEMORY_NONE,     /* tag - not wiring */
3335                     THREAD_UNINT,            /* interruptible */
3336                     NULL,                    /* caller_pmap */
3337                     0 /* caller_pmap_addr */);
3338         }
3339 }
3340
3341
3342 /*
3343  *      Routine:        vm_fault
3344  *      Purpose:
3345  *              Handle page faults, including pseudo-faults
3346  *              used to change the wiring status of pages.
3347  *      Returns:
3348  *              Explicit continuations have been removed.
3349  *      Implementation:
3350  *              vm_fault and vm_fault_page save mucho state
3351  *              in the moral equivalent of a closure.  The state
3352  *              structure is allocated when first entering vm_fault
3353  *              and deallocated when leaving vm_fault.
3354  */
3355
/*
 * Symbols defined outside this part of the file.
 * NOTE(review): their users are not visible near these declarations --
 * confirm both are still referenced.
 */
3356 extern int _map_enter_debug;
3357 extern uint64_t get_current_unique_pid(void);
3358
/*
 * Zero-initialized global counters.
 * NOTE(review): the names suggest "collapse attempts considered" and
 * "collapse attempts skipped" in the fault path; the code that updates
 * them is not in view -- confirm before relying on that meaning.
 */
3359 unsigned long vm_fault_collapse_total = 0;
3360 unsigned long vm_fault_collapse_skipped = 0;
3361
3362
/*
 *      Routine:        vm_fault_external
 *      Purpose:
 *              Variant of vm_fault() whose signature has no wire tag.
 *              Every argument is forwarded unchanged to
 *              vm_fault_internal(); the wire tag is taken from
 *              vm_tag_bt() and physpage_p is passed as NULL.
 *              NOTE(review): vm_tag_bt() presumably derives the tag
 *              from the caller's backtrace -- confirm.
 *      Returns:
 *              The kern_return_t from vm_fault_internal(), unmodified.
 */
3363 kern_return_t
3364 vm_fault_external(
3365         vm_map_t        map,
3366         vm_map_offset_t vaddr,
3367         vm_prot_t       fault_type,             /* received by vm_fault_internal() as caller_prot */
3368         boolean_t       change_wiring,
3369         int             interruptible,
3370         pmap_t          caller_pmap,
3371         vm_map_offset_t caller_pmap_addr)
3372 {
3373         return vm_fault_internal(map, vaddr, fault_type, change_wiring, vm_tag_bt(),
3374                    interruptible, caller_pmap, caller_pmap_addr,
3375                    NULL);                      /* physpage_p: not requested */
3376 }
3377
/*
 * vm_fault:
 *      Entry point that forwards all of its arguments, including the
 *      caller-supplied wire_tag, to vm_fault_internal() and passes NULL
 *      for that routine's final physpage_p argument.
 *
 *      Returns the kern_return_t from vm_fault_internal(), unmodified.
 */
3378 kern_return_t
3379 vm_fault(
3380         vm_map_t        map,
3381         vm_map_offset_t vaddr,
3382         vm_prot_t       fault_type,             /* received by vm_fault_internal() as caller_prot */
3383         boolean_t       change_wiring,
3384         vm_tag_t        wire_tag,               /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3385         int             interruptible,
3386         pmap_t          caller_pmap,
3387         vm_map_offset_t caller_pmap_addr)
3388 {
3389         return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag,
3390                    interruptible, caller_pmap, caller_pmap_addr,
3391                    NULL);                      /* physpage_p: not requested */
3392 }
3393
/*
 * current_proc_is_privileged:
 *      File-local predicate.  "Privileged" here means exactly what
 *      csproc_get_platform_binary() reports for current_proc(); that
 *      result is returned directly as a boolean_t.
 *      NOTE(review): no caller is visible near this definition --
 *      confirm where the predicate is consulted.
 */
3394 static boolean_t
3395 current_proc_is_privileged(void)
3396 {
3397         return csproc_get_platform_binary(current_proc());
3398 }
3399
/*
 * Zero-initialized global counter.
 * NOTE(review): presumably counts pages copied while handling read
 * faults (vm_fault_internal() declares a need_copy_on_read local); the
 * update site is not in view -- confirm.
 */
3400 uint64_t vm_copied_on_read = 0;
3401
3402 kern_return_t
3403 vm_fault_internal(
3404         vm_map_t        map,
3405         vm_map_offset_t vaddr,
3406         vm_prot_t       caller_prot,
3407         boolean_t       change_wiring,
3408         vm_tag_t        wire_tag,               /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3409         int             interruptible,
3410         pmap_t          caller_pmap,
3411         vm_map_offset_t caller_pmap_addr,
3412         ppnum_t         *physpage_p)
3413 {
3414         vm_map_version_t        version;        /* Map version for verificiation */
3415         boolean_t               wired;          /* Should mapping be wired down? */
3416         vm_object_t             object;         /* Top-level object */
3417         vm_object_offset_t      offset;         /* Top-level offset */
3418         vm_prot_t               prot;           /* Protection for mapping */
3419         vm_object_t             old_copy_object; /* Saved copy object */
3420         vm_page_t               result_page;    /* Result of vm_fault_page */
3421         vm_page_t               top_page;       /* Placeholder page */
3422         kern_return_t           kr;
3423
3424         vm_page_t               m;      /* Fast access to result_page */
3425         kern_return_t           error_code;
3426         vm_object_t             cur_object;
3427         vm_object_t             m_object = NULL;
3428         vm_object_offset_t      cur_offset;
3429         vm_page_t               cur_m;
3430         vm_object_t             new_object;
3431         int                     type_of_fault;
3432         pmap_t                  pmap;
3433         wait_interrupt_t        interruptible_state;
3434         vm_map_t                real_map = map;
3435         vm_map_t                original_map = map;
3436         boolean_t               object_locks_dropped = FALSE;
3437         vm_prot_t               fault_type;
3438         vm_prot_t               original_fault_type;
3439         struct vm_object_fault_info fault_info = {};
3440         boolean_t               need_collapse = FALSE;
3441         boolean_t               need_retry = FALSE;
3442         boolean_t               *need_retry_ptr = NULL;
3443         int                     object_lock_type = 0;
3444         int                     cur_object_lock_type;
3445         vm_object_t             top_object = VM_OBJECT_NULL;
3446         vm_object_t             written_on_object = VM_OBJECT_NULL;
3447         memory_object_t         written_on_pager = NULL;
3448         vm_object_offset_t      written_on_offset = 0;
3449         int                     throttle_delay;
3450         int                     compressed_count_delta;
3451         int                     grab_options;
3452         boolean_t               need_copy;
3453         boolean_t               need_copy_on_read;
3454         vm_map_offset_t         trace_vaddr;
3455         vm_map_offset_t         trace_real_vaddr;
3456         vm_map_offset_t         real_vaddr;
3457         boolean_t               resilient_media_retry = FALSE;
3458         vm_object_t             resilient_media_object = VM_OBJECT_NULL;
3459         vm_object_offset_t      resilient_media_offset = (vm_object_offset_t)-1;
3460
3461         real_vaddr = vaddr;
3462         trace_real_vaddr = vaddr;
3463         vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
3464
3465         if (map == kernel_map) {
3466                 trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
3467                 trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
3468         } else {
3469                 trace_vaddr = vaddr;
3470         }
3471
3472         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3473             (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
3474             ((uint64_t)trace_vaddr >> 32),
3475             trace_vaddr,
3476             (map == kernel_map),
3477             0,
3478             0);
3479
3480         if (get_preemption_level() != 0) {
3481                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3482                     (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3483                     ((uint64_t)trace_vaddr >> 32),
3484                     trace_vaddr,
3485                     KERN_FAILURE,
3486                     0,
3487                     0);
3488
3489                 return KERN_FAILURE;
3490         }
3491
3492         thread_t cthread = current_thread();
3493         boolean_t rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
3494         uint64_t fstart = 0;
3495
3496         if (rtfault) {
3497                 fstart = mach_continuous_time();
3498         }
3499
3500         interruptible_state = thread_interrupt_level(interruptible);
3501
3502         fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
3503
3504         VM_STAT_INCR(faults);
3505         current_task()->faults++;
3506         original_fault_type = fault_type;
3507
3508         need_copy = FALSE;
3509         if (fault_type & VM_PROT_WRITE) {
3510                 need_copy = TRUE;
3511         }
3512
3513         if (need_copy) {
3514                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3515         } else {
3516                 object_lock_type = OBJECT_LOCK_SHARED;
3517         }
3518
3519         cur_object_lock_type = OBJECT_LOCK_SHARED;
3520
3521         if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
3522                 if (compressor_map) {
3523                         if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
3524                                 panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map));
3525                         }
3526                 }
3527         }
3528 RetryFault:
3529         assert(written_on_object == VM_OBJECT_NULL);
3530
3531         /*
3532          * assume we will hit a page in the cache
3533          * otherwise, explicitly override with
3534          * the real fault type once we determine it
3535          */
3536         type_of_fault = DBG_CACHE_HIT_FAULT;
3537
3538         /*
3539          *      Find the backing store object and offset into
3540          *      it to begin the search.
3541          */
3542         fault_type = original_fault_type;
3543         map = original_map;
3544         vm_map_lock_read(map);
3545
3546         if (resilient_media_retry) {
3547                 /*
3548                  * If we have to insert a fake zero-filled page to hide
3549                  * a media failure to provide the real page, we need to
3550                  * resolve any pending copy-on-write on this mapping.
3551                  * VM_PROT_COPY tells vm_map_lookup_locked() to deal
3552                  * with that even if this is not a "write" fault.
3553                  */
3554                 need_copy = TRUE;
3555                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3556         }
3557
3558         kr = vm_map_lookup_locked(&map, vaddr,
3559             (fault_type | (need_copy ? VM_PROT_COPY : 0)),
3560             object_lock_type, &version,
3561             &object, &offset, &prot, &wired,
3562             &fault_info,
3563             &real_map);
3564
3565         if (kr != KERN_SUCCESS) {
3566                 vm_map_unlock_read(map);
3567                 goto done;
3568         }
3569         pmap = real_map->pmap;
3570         fault_info.interruptible = interruptible;
3571         fault_info.stealth = FALSE;
3572         fault_info.io_sync = FALSE;
3573         fault_info.mark_zf_absent = FALSE;
3574         fault_info.batch_pmap_op = FALSE;
3575
3576         if (resilient_media_retry) {
3577                 /*
3578                  * We're retrying this fault after having detected a media
3579                  * failure from a "resilient_media" mapping.
3580                  * Check that the mapping is still pointing at the object
3581                  * that just failed to provide a page.
3582                  */
3583                 assert(resilient_media_object != VM_OBJECT_NULL);
3584                 assert(resilient_media_offset != (vm_object_offset_t)-1);
3585                 if (object != VM_OBJECT_NULL &&
3586                     object == resilient_media_object &&
3587                     offset == resilient_media_offset &&
3588                     fault_info.resilient_media) {
3589                         /*
3590                          * This mapping still points at the same object
3591                          * and is still "resilient_media": proceed in
3592                          * "recovery-from-media-failure" mode, where we'll
3593                          * insert a zero-filled page in the top object.
3594                          */
3595 //                     printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset);
3596                 } else {
3597                         /* not recovering: reset state */
3598 //                     printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info.resilient_media, object, resilient_media_object, offset, resilient_media_offset);
3599                         resilient_media_retry = FALSE;
3600                         /* release our extra reference on failed object */
3601 //                     printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
3602                         vm_object_deallocate(resilient_media_object);
3603                         resilient_media_object = VM_OBJECT_NULL;
3604                         resilient_media_offset = (vm_object_offset_t)-1;
3605                 }
3606         } else {
3607                 assert(resilient_media_object == VM_OBJECT_NULL);
3608                 resilient_media_offset = (vm_object_offset_t)-1;
3609         }
3610
3611         /*
3612          * If the page is wired, we must fault for the current protection
3613          * value, to avoid further faults.
3614          */
3615         if (wired) {
3616                 fault_type = prot | VM_PROT_WRITE;
3617         }
3618         if (wired || need_copy) {
3619                 /*
3620                  * since we're treating this fault as a 'write'
3621                  * we must hold the top object lock exclusively
3622                  */
3623                 if (object_lock_type == OBJECT_LOCK_SHARED) {
3624                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3625
3626                         if (vm_object_lock_upgrade(object) == FALSE) {
3627                                 /*
3628                                  * couldn't upgrade, so explicitly
3629                                  * take the lock exclusively
3630                                  */
3631                                 vm_object_lock(object);
3632                         }
3633                 }
3634         }
3635
3636 #if     VM_FAULT_CLASSIFY
3637         /*
3638          *      Temporary data gathering code
3639          */
3640         vm_fault_classify(object, offset, fault_type);
3641 #endif
3642         /*
3643          *      Fast fault code.  The basic idea is to do as much as
3644          *      possible while holding the map lock and object locks.
3645          *      Busy pages are not used until the object lock has to
3646          *      be dropped to do something (copy, zero fill, pmap enter).
3647          *      Similarly, paging references aren't acquired until that
3648          *      point, and object references aren't used.
3649          *
3650          *      If we can figure out what to do
3651          *      (zero fill, copy on write, pmap enter) while holding
3652          *      the locks, then it gets done.  Otherwise, we give up,
3653          *      and use the original fault path (which doesn't hold
3654          *      the map lock, and relies on busy pages).
3655          *      The give up cases include:
3656          *              - Have to talk to pager.
3657          *              - Page is busy, absent or in error.
3658          *              - Pager has locked out desired access.
3659          *              - Fault needs to be restarted.
3660          *              - Have to push page into copy object.
3661          *
3662          *      The code is an infinite loop that moves one level down
3663          *      the shadow chain each time.  cur_object and cur_offset
3664          *      refer to the current object being examined. object and offset
3665          *      are the original object from the map.  The loop is at the
3666          *      top level if and only if object and cur_object are the same.
3667          *
3668          *      Invariants:  Map lock is held throughout.  Lock is held on
3669          *              original object and cur_object (if different) when
3670          *              continuing or exiting loop.
3671          *
3672          */
3673
3674 #if defined(__arm64__)
3675         /*
3676          * Fail if reading an execute-only page in a
3677          * pmap that enforces execute-only protection.
3678          */
3679         if (fault_type == VM_PROT_READ &&
3680             (prot & VM_PROT_EXECUTE) &&
3681             !(prot & VM_PROT_READ) &&
3682             pmap_enforces_execute_only(pmap)) {
3683                 vm_object_unlock(object);
3684                 vm_map_unlock_read(map);
3685                 if (real_map != map) {
3686                         vm_map_unlock(real_map);
3687                 }
3688                 kr = KERN_PROTECTION_FAILURE;
3689                 goto done;
3690         }
3691 #endif
3692
3693         /*
3694          * If this page is to be inserted in a copy delay object
3695          * for writing, and if the object has a copy, then the
3696          * copy delay strategy is implemented in the slow fault page.
3697          */
3698         if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
3699             object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) {
3700                 goto handle_copy_delay;
3701         }
3702
3703         cur_object = object;
3704         cur_offset = offset;
3705
3706         grab_options = 0;
3707 #if CONFIG_SECLUDED_MEMORY
3708         if (object->can_grab_secluded) {
3709                 grab_options |= VM_PAGE_GRAB_SECLUDED;
3710         }
3711 #endif /* CONFIG_SECLUDED_MEMORY */
3712
3713         while (TRUE) {
3714                 if (!cur_object->pager_created &&
3715                     cur_object->phys_contiguous) { /* superpage */
3716                         break;
3717                 }
3718
3719                 if (cur_object->blocked_access) {
3720                         /*
3721                          * Access to this VM object has been blocked.
3722                          * Let the slow path handle it.
3723                          */
3724                         break;
3725                 }
3726
3727                 m = vm_page_lookup(cur_object, cur_offset);
3728                 m_object = NULL;
3729
3730                 if (m != VM_PAGE_NULL) {
3731                         m_object = cur_object;
3732
3733                         if (m->vmp_busy) {
3734                                 wait_result_t   result;
3735
3736                                 /*
3737                                  * in order to do the PAGE_ASSERT_WAIT, we must
3738                                  * have object that 'm' belongs to locked exclusively
3739                                  */
3740                                 if (object != cur_object) {
3741                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3742                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3743
3744                                                 if (vm_object_lock_upgrade(cur_object) == FALSE) {
3745                                                         /*
3746                                                          * couldn't upgrade so go do a full retry
3747                                                          * immediately since we can no longer be
3748                                                          * certain about cur_object (since we
3749                                                          * don't hold a reference on it)...
3750                                                          * first drop the top object lock
3751                                                          */
3752                                                         vm_object_unlock(object);
3753
3754                                                         vm_map_unlock_read(map);
3755                                                         if (real_map != map) {
3756                                                                 vm_map_unlock(real_map);
3757                                                         }
3758
3759                                                         goto RetryFault;
3760                                                 }
3761                                         }
3762                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3763                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3764
3765                                         if (vm_object_lock_upgrade(object) == FALSE) {
3766                                                 /*
3767                                                  * couldn't upgrade, so explicitly take the lock
3768                                                  * exclusively and go relookup the page since we
3769                                                  * will have dropped the object lock and
3770                                                  * a different thread could have inserted
3771                                                  * a page at this offset
3772                                                  * no need for a full retry since we're
3773                                                  * at the top level of the object chain
3774                                                  */
3775                                                 vm_object_lock(object);
3776
3777                                                 continue;
3778                                         }
3779                                 }
3780                                 if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
3781                                         /*
3782                                          * m->vmp_busy == TRUE and the object is locked exclusively
3783                                          * if m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q after we acquire the
3784                                          * queues lock, we are guaranteed that it is stable on
3785                                          * the pageout queue and therefore reclaimable
3786                                          *
3787                                          * NOTE: this is only true for the internal pageout queue
3788                                          * in the compressor world
3789                                          */
3790                                         assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3791
3792                                         vm_page_lock_queues();
3793
3794                                         if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
3795                                                 vm_pageout_throttle_up(m);
3796                                                 vm_page_unlock_queues();
3797
3798                                                 PAGE_WAKEUP_DONE(m);
3799                                                 goto reclaimed_from_pageout;
3800                                         }
3801                                         vm_page_unlock_queues();
3802                                 }
3803                                 if (object != cur_object) {
3804                                         vm_object_unlock(object);
3805                                 }
3806
3807                                 vm_map_unlock_read(map);
3808                                 if (real_map != map) {
3809                                         vm_map_unlock(real_map);
3810                                 }
3811
3812                                 result = PAGE_ASSERT_WAIT(m, interruptible);
3813
3814                                 vm_object_unlock(cur_object);
3815
3816                                 if (result == THREAD_WAITING) {
3817                                         result = thread_block(THREAD_CONTINUE_NULL);
3818
3819                                         counter(c_vm_fault_page_block_busy_kernel++);
3820                                 }
3821                                 if (result == THREAD_AWAKENED || result == THREAD_RESTART) {
3822                                         goto RetryFault;
3823                                 }
3824
3825                                 kr = KERN_ABORTED;
3826                                 goto done;
3827                         }
3828 reclaimed_from_pageout:
3829                         if (m->vmp_laundry) {
3830                                 if (object != cur_object) {
3831                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3832                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3833
3834                                                 vm_object_unlock(object);
3835                                                 vm_object_unlock(cur_object);
3836
3837                                                 vm_map_unlock_read(map);
3838                                                 if (real_map != map) {
3839                                                         vm_map_unlock(real_map);
3840                                                 }
3841
3842                                                 goto RetryFault;
3843                                         }
3844                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3845                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3846
3847                                         if (vm_object_lock_upgrade(object) == FALSE) {
3848                                                 /*
3849                                                  * couldn't upgrade, so explicitly take the lock
3850                                                  * exclusively and go relookup the page since we
3851                                                  * will have dropped the object lock and
3852                                                  * a different thread could have inserted
3853                                                  * a page at this offset
3854                                                  * no need for a full retry since we're
3855                                                  * at the top level of the object chain
3856                                                  */
3857                                                 vm_object_lock(object);
3858
3859                                                 continue;
3860                                         }
3861                                 }
3862                                 vm_pageout_steal_laundry(m, FALSE);
3863                         }
3864
3865                         if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
3866                                 /*
3867                                  * Guard page: let the slow path deal with it
3868                                  */
3869                                 break;
3870                         }
3871                         if (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent)) {
3872                                 /*
3873                                  * Unusual case... let the slow path deal with it
3874                                  */
3875                                 break;
3876                         }
3877                         if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
3878                                 if (object != cur_object) {
3879                                         vm_object_unlock(object);
3880                                 }
3881                                 vm_map_unlock_read(map);
3882                                 if (real_map != map) {
3883                                         vm_map_unlock(real_map);
3884                                 }
3885                                 vm_object_unlock(cur_object);
3886                                 kr = KERN_MEMORY_ERROR;
3887                                 goto done;
3888                         }
3889                         assert(m_object == VM_PAGE_OBJECT(m));
3890
3891                         if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m, m_object) ||
3892                             (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
3893 upgrade_lock_and_retry:
3894                                 /*
3895                                  * We might need to validate this page
3896                                  * against its code signature, so we
3897                                  * want to hold the VM object exclusively.
3898                                  */
3899                                 if (object != cur_object) {
3900                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3901                                                 vm_object_unlock(object);
3902                                                 vm_object_unlock(cur_object);
3903
3904                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3905
3906                                                 vm_map_unlock_read(map);
3907                                                 if (real_map != map) {
3908                                                         vm_map_unlock(real_map);
3909                                                 }
3910
3911                                                 goto RetryFault;
3912                                         }
3913                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3914                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3915
3916                                         if (vm_object_lock_upgrade(object) == FALSE) {
3917                                                 /*
3918                                                  * couldn't upgrade, so explicitly take the lock
3919                                                  * exclusively and go relookup the page since we
3920                                                  * will have dropped the object lock and
3921                                                  * a different thread could have inserted
3922                                                  * a page at this offset
3923                                                  * no need for a full retry since we're
3924                                                  * at the top level of the object chain
3925                                                  */
3926                                                 vm_object_lock(object);
3927
3928                                                 continue;
3929                                         }
3930                                 }
3931                         }
3932                         /*
3933                          *      Two cases of map in faults:
3934                          *          - At top level w/o copy object.
3935                          *          - Read fault anywhere.
3936                          *              --> must disallow write.
3937                          */
3938
3939                         if (object == cur_object && object->copy == VM_OBJECT_NULL) {
3940                                 goto FastPmapEnter;
3941                         }
3942
3943                         if (!need_copy &&
3944                             !fault_info.no_copy_on_read &&
3945                             cur_object != object &&
3946                             !cur_object->internal &&
3947                             !cur_object->pager_trusted &&
3948                             vm_protect_privileged_from_untrusted &&
3949                             !((prot & VM_PROT_EXECUTE) &&
3950                             cur_object->code_signed &&
3951                             cs_process_enforcement(NULL)) &&
3952                             current_proc_is_privileged()) {
3953                                 /*
3954                                  * We're faulting on a page in "object" and
3955                                  * went down the shadow chain to "cur_object"
3956                                  * to find out that "cur_object"'s pager
3957                                  * is not "trusted", i.e. we can not trust it
3958                                  * to always return the same contents.
3959                                  * Since the target is a "privileged" process,
3960                                  * let's treat this as a copy-on-read fault, as
3961                                  * if it was a copy-on-write fault.
3962                                  * Once "object" gets a copy of this page, it
3963                                  * won't have to rely on "cur_object" to
3964                                  * provide the contents again.
3965                                  *
3966                                  * This is done by setting "need_copy" and
3967                                  * retrying the fault from the top with the
3968                                  * appropriate locking.
3969                                  *
3970                                  * Special case: if the mapping is executable
3971                                  * and the untrusted object is code-signed and
3972                                  * the process is "cs_enforced", we do not
3973                                  * copy-on-read because that would break
3974                                  * code-signing enforcement expectations (an
3975                                  * executable page must belong to a code-signed
3976                                  * object) and we can rely on code-signing
3977                                  * to re-validate the page if it gets evicted
3978                                  * and paged back in.
3979                                  */
3980 //                              printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset);
3981                                 vm_copied_on_read++;
3982                                 need_copy = TRUE;
3983
3984                                 vm_object_unlock(object);
3985                                 vm_object_unlock(cur_object);
3986                                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3987                                 vm_map_unlock_read(map);
3988                                 if (real_map != map) {
3989                                         vm_map_unlock(real_map);
3990                                 }
3991                                 goto RetryFault;
3992                         }
3993
3994                         if (!(fault_type & VM_PROT_WRITE) && !need_copy) {
3995                                 if (!pmap_has_prot_policy(prot)) {
3996                                         prot &= ~VM_PROT_WRITE;
3997                                 } else {
3998                                         /*
3999                                          * For a protection that the pmap cares
4000                                          * about, we must hand over the full
4001                                          * set of protections (so that the pmap
4002                                          * layer can apply any desired policy).
4003                                          * This means that cs_bypass must be
4004                                          * set, as this can force us to pass
4005                                          * RWX.
4006                                          */
4007                                         assert(fault_info.cs_bypass);
4008                                 }
4009
4010                                 if (object != cur_object) {
4011                                         /*
4012                                          * We still need to hold the top object
4013                                          * lock here to prevent a race between
4014                                          * a read fault (taking only "shared"
4015                                          * locks) and a write fault (taking
4016                                          * an "exclusive" lock on the top
4017                                          * object).
4018                                          * Otherwise, as soon as we release the
4019                                          * top lock, the write fault could
4020                                          * proceed and actually complete before
4021                                          * the read fault, and the copied page's
4022                                          * translation could then be overwritten
4023                                          * by the read fault's translation for
4024                                          * the original page.
4025                                          *
4026                                          * Let's just record what the top object
4027                                          * is and we'll release it later.
4028                                          */
4029                                         top_object = object;
4030
4031                                         /*
4032                                          * switch to the object that has the new page
4033                                          */
4034                                         object = cur_object;
4035                                         object_lock_type = cur_object_lock_type;
4036                                 }
4037 FastPmapEnter:
4038                                 assert(m_object == VM_PAGE_OBJECT(m));
4039
4040                                 /*
4041                                  * prepare for the pmap_enter...
4042                                  * object and map are both locked
4043                                  * m contains valid data
4044                                  * object == m->vmp_object
4045                                  * cur_object == NULL or it's been unlocked
4046                                  * no paging references on either object or cur_object
4047                                  */
4048                                 if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
4049                                         need_retry_ptr = &need_retry;
4050                                 } else {
4051                                         need_retry_ptr = NULL;
4052                                 }
4053
4054                                 if (caller_pmap) {
4055                                         kr = vm_fault_enter(m,
4056                                             caller_pmap,
4057                                             caller_pmap_addr,
4058                                             prot,
4059                                             caller_prot,
4060                                             wired,
4061                                             change_wiring,
4062                                             wire_tag,
4063                                             &fault_info,
4064                                             need_retry_ptr,
4065                                             &type_of_fault);
4066                                 } else {
4067                                         kr = vm_fault_enter(m,
4068                                             pmap,
4069                                             vaddr,
4070                                             prot,
4071                                             caller_prot,
4072                                             wired,
4073                                             change_wiring,
4074                                             wire_tag,
4075                                             &fault_info,
4076                                             need_retry_ptr,
4077                                             &type_of_fault);
4078                                 }
4079                                 {
4080                                         int     event_code = 0;
4081
4082                                         if (m_object->internal) {
4083                                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
4084                                         } else if (m_object->object_is_shared_cache) {
4085                                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
4086                                         } else {
4087                                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
4088                                         }
4089
4090                                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0);
4091
4092                                         DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
4093                                 }
4094                                 if (kr == KERN_SUCCESS &&
4095                                     physpage_p != NULL) {
4096                                         /* for vm_map_wire_and_extract() */
4097                                         *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
4098                                         if (prot & VM_PROT_WRITE) {
4099                                                 vm_object_lock_assert_exclusive(m_object);
4100                                                 m->vmp_dirty = TRUE;
4101                                         }
4102                                 }
4103
4104                                 if (top_object != VM_OBJECT_NULL) {
4105                                         /*
4106                                          * It's safe to drop the top object
4107                                          * now that we've done our
4108                                          * vm_fault_enter().  Any other fault
4109                                          * in progress for that virtual
4110                                          * address will either find our page
4111                                          * and translation or put in a new page
4112                                          * and translation.
4113                                          */
4114                                         vm_object_unlock(top_object);
4115                                         top_object = VM_OBJECT_NULL;
4116                                 }
4117
4118                                 if (need_collapse == TRUE) {
4119                                         vm_object_collapse(object, offset, TRUE);
4120                                 }
4121
4122                                 if (need_retry == FALSE &&
4123                                     (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
4124                                         /*
4125                                          * evaluate access pattern and update state
4126                                          * vm_fault_deactivate_behind depends on the
4127                                          * state being up to date
4128                                          */
4129                                         vm_fault_is_sequential(m_object, cur_offset, fault_info.behavior);
4130
4131                                         vm_fault_deactivate_behind(m_object, cur_offset, fault_info.behavior);
4132                                 }
4133                                 /*
4134                                  * That's it, clean up and return.
4135                                  */
4136                                 if (m->vmp_busy) {
4137                                         PAGE_WAKEUP_DONE(m);
4138                                 }
4139
4140                                 if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
4141                                         vm_object_paging_begin(m_object);
4142
4143                                         assert(written_on_object == VM_OBJECT_NULL);
4144                                         written_on_object = m_object;
4145                                         written_on_pager = m_object->pager;
4146                                         written_on_offset = m_object->paging_offset + m->vmp_offset;
4147                                 }
4148                                 vm_object_unlock(object);
4149
4150                                 vm_map_unlock_read(map);
4151                                 if (real_map != map) {
4152                                         vm_map_unlock(real_map);
4153                                 }
4154
4155                                 if (need_retry == TRUE) {
4156                                         /*
4157                                          * vm_fault_enter couldn't complete the PMAP_ENTER...
4158                                          * at this point we don't hold any locks so it's safe
4159                                          * to ask the pmap layer to expand the page table to
4160                                          * accommodate this mapping... once expanded, we'll
4161                                          * re-drive the fault which should result in vm_fault_enter
4162                                          * being able to successfully enter the mapping this time around
4163                                          */
4164                                         (void)pmap_enter_options(
4165                                                 pmap, vaddr, 0, 0, 0, 0, 0,
4166                                                 PMAP_OPTIONS_NOENTER, NULL);
4167
4168                                         need_retry = FALSE;
4169                                         goto RetryFault;
4170                                 }
4171                                 goto done;
4172                         }
4173                         /*
4174                          * COPY ON WRITE FAULT
4175                          */
4176                         assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
4177
4178                         /*
4179                          * If objects match, then
4180                          * object->copy must not be NULL (else control
4181                          * would be in previous code block), and we
4182                          * have a potential push into the copy object
4183                          * with which we can't cope here.
4184                          */
4185                         if (cur_object == object) {
4186                                 /*
4187                                  * must take the slow path to
4188                                  * deal with the copy push
4189                                  */
4190                                 break;
4191                         }
4192
4193                         /*
4194                          * This is now a shadow based copy on write
4195                          * fault -- it requires a copy up the shadow
4196                          * chain.
4197                          */
4198                         assert(m_object == VM_PAGE_OBJECT(m));
4199
4200                         if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
4201                             VM_FAULT_NEED_CS_VALIDATION(NULL, m, m_object)) {
4202                                 goto upgrade_lock_and_retry;
4203                         }
4204
4205                         /*
4206                          * Allocate a page in the original top level
4207                          * object. Give up if allocate fails.  Also
4208                          * need to remember current page, as it's the
4209                          * source of the copy.
4210                          *
4211                          * at this point we hold locks on both
4212                          * object and cur_object... no need to take
4213                          * paging refs or mark pages BUSY since
4214                          * we don't drop either object lock until
4215                          * the page has been copied and inserted
4216                          */
4217                         cur_m = m;
4218                         m = vm_page_grab_options(grab_options);
4219                         m_object = NULL;
4220
4221                         if (m == VM_PAGE_NULL) {
4222                                 /*
4223                                  * no free page currently available...
4224                                  * must take the slow path
4225                                  */
4226                                 break;
4227                         }
4228                         /*
4229                          * Now do the copy.  Mark the source page busy...
4230                          *
4231                          *      NOTE: This code holds the map lock across
4232                          *      the page copy.
4233                          */
4234                         vm_page_copy(cur_m, m);
4235                         vm_page_insert(m, object, offset);
4236                         m_object = object;
4237                         SET_PAGE_DIRTY(m, FALSE);
4238
4239                         /*
4240                          * Now cope with the source page and object
4241                          */
4242                         if (object->ref_count > 1 && cur_m->vmp_pmapped) {
4243                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
4244                         }
4245
4246                         if (cur_m->vmp_clustered) {
4247                                 VM_PAGE_COUNT_AS_PAGEIN(cur_m);
4248                                 VM_PAGE_CONSUME_CLUSTERED(cur_m);
4249                                 vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior);
4250                         }
4251                         need_collapse = TRUE;
4252
4253                         if (!cur_object->internal &&
4254                             cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
4255                                 /*
4256                                  * The object from which we've just
4257                                  * copied a page is most probably backed
4258                                  * by a vnode.  We don't want to waste too
4259                                  * much time trying to collapse the VM objects
4260                                  * and create a bottleneck when several tasks
4261                                  * map the same file.
4262                                  */
4263                                 if (cur_object->copy == object) {
4264                                         /*
4265                                          * Shared mapping or no COW yet.
4266                                          * We can never collapse a copy
4267                                          * object into its backing object.
4268                                          */
4269                                         need_collapse = FALSE;
4270                                 } else if (cur_object->copy == object->shadow &&
4271                                     object->shadow->resident_page_count == 0) {
4272                                         /*
4273                                          * Shared mapping after a COW occurred.
4274                                          */
4275                                         need_collapse = FALSE;
4276                                 }
4277                         }
4278                         vm_object_unlock(cur_object);
4279
4280                         if (need_collapse == FALSE) {
4281                                 vm_fault_collapse_skipped++;
4282                         }
4283                         vm_fault_collapse_total++;
4284
4285                         type_of_fault = DBG_COW_FAULT;
4286                         VM_STAT_INCR(cow_faults);
4287                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
4288                         current_task()->cow_faults++;
4289
4290                         goto FastPmapEnter;
4291                 } else {
4292                         /*
4293                          * No page at cur_object, cur_offset... m == NULL
4294                          */
4295                         if (cur_object->pager_created) {
4296                                 int     compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
4297
4298                                 if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
4299                                         int             my_fault_type;
4300                                         int             c_flags = C_DONT_BLOCK;
4301                                         boolean_t       insert_cur_object = FALSE;
4302
4303                                         /*
4304                                          * May have to talk to a pager...
4305                                          * if so, take the slow path by
4306                                          * doing a 'break' from the while (TRUE) loop
4307                                          *
4308                                          * external_state will only be set to VM_EXTERNAL_STATE_EXISTS
4309                                          * if the compressor is active and the page exists there
4310                                          */
4311                                         if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS) {
4312                                                 break;
4313                                         }
4314
4315                                         if (map == kernel_map || real_map == kernel_map) {
4316                                                 /*
4317                                                  * can't call into the compressor with the kernel_map
4318                                                  * lock held, since the compressor may try to operate
4319                                                  * on the kernel map in order to return an empty c_segment
4320                                                  */
4321                                                 break;
4322                                         }
4323                                         if (object != cur_object) {
4324                                                 if (fault_type & VM_PROT_WRITE) {
4325                                                         c_flags |= C_KEEP;
4326                                                 } else {
4327                                                         insert_cur_object = TRUE;
4328                                                 }
4329                                         }
4330                                         if (insert_cur_object == TRUE) {
4331                                                 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4332                                                         cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4333
4334                                                         if (vm_object_lock_upgrade(cur_object) == FALSE) {
4335                                                                 /*
4336                                                                  * couldn't upgrade so go do a full retry
4337                                                                  * immediately since we can no longer be
4338                                                                  * certain about cur_object (since we
4339                                                                  * don't hold a reference on it)...
4340                                                                  * first drop the top object lock
4341                                                                  */
4342                                                                 vm_object_unlock(object);
4343
4344                                                                 vm_map_unlock_read(map);
4345                                                                 if (real_map != map) {
4346                                                                         vm_map_unlock(real_map);
4347                                                                 }
4348
4349                                                                 goto RetryFault;
4350                                                         }
4351                                                 }
4352                                         } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4353                                                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4354
4355                                                 if (object != cur_object) {
4356                                                         /*
4357                                                          * we can't go for the upgrade on the top
4358                                                          * lock since the upgrade may block waiting
4359                                                          * for readers to drain... since we hold
4360                                                          * cur_object locked at this point, waiting
4361                                                          * for the readers to drain would represent
4362                                                          * a lock order inversion since the lock order
4363                                                          * for objects is the reference order in the
4364                                                          * shadow chain
4365                                                          */
4366                                                         vm_object_unlock(object);
4367                                                         vm_object_unlock(cur_object);
4368
4369                                                         vm_map_unlock_read(map);
4370                                                         if (real_map != map) {
4371                                                                 vm_map_unlock(real_map);
4372                                                         }
4373
4374                                                         goto RetryFault;
4375                                                 }
4376                                                 if (vm_object_lock_upgrade(object) == FALSE) {
4377                                                         /*
4378                                                          * couldn't upgrade, so explicitly take the lock
4379                                                          * exclusively and go relookup the page since we
4380                                                          * will have dropped the object lock and
4381                                                          * a different thread could have inserted
4382                                                          * a page at this offset
4383                                                          * no need for a full retry since we're
4384                                                          * at the top level of the object chain
4385                                                          */
4386                                                         vm_object_lock(object);
4387
4388                                                         continue;
4389                                                 }
4390                                         }
4391                                         m = vm_page_grab_options(grab_options);
4392                                         m_object = NULL;
4393
4394                                         if (m == VM_PAGE_NULL) {
4395                                                 /*
4396                                                  * no free page currently available...
4397                                                  * must take the slow path
4398                                                  */
4399                                                 break;
4400                                         }
4401
4402                                         /*
4403                                          * The object is and remains locked
4404                                          * so no need to take a
4405                                          * "paging_in_progress" reference.
4406                                          */
4407                                         boolean_t shared_lock;
4408                                         if ((object == cur_object &&
4409                                             object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
4410                                             (object != cur_object &&
4411                                             cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
4412                                                 shared_lock = FALSE;
4413                                         } else {
4414                                                 shared_lock = TRUE;
4415                                         }
4416
4417                                         kr = vm_compressor_pager_get(
4418                                                 cur_object->pager,
4419                                                 (cur_offset +
4420                                                 cur_object->paging_offset),
4421                                                 VM_PAGE_GET_PHYS_PAGE(m),
4422                                                 &my_fault_type,
4423                                                 c_flags,
4424                                                 &compressed_count_delta);
4425
4426                                         vm_compressor_pager_count(
4427                                                 cur_object->pager,
4428                                                 compressed_count_delta,
4429                                                 shared_lock,
4430                                                 cur_object);
4431
4432                                         if (kr != KERN_SUCCESS) {
4433                                                 vm_page_release(m, FALSE);
4434                                                 m = VM_PAGE_NULL;
4435                                                 break;
4436                                         }
4437                                         m->vmp_dirty = TRUE;
4438
4439                                         /*
4440                                          * If the object is purgeable, its
4441                                          * owner's purgeable ledgers will be
4442                                          * updated in vm_page_insert() but the
4443                                          * page was also accounted for in a
4444                                          * "compressed purgeable" ledger, so
4445                                          * update that now.
4446                                          */
4447                                         if (object != cur_object &&
4448                                             !insert_cur_object) {
4449                                                 /*
4450                                                  * We're not going to insert
4451                                                  * the decompressed page into
4452                                                  * the object it came from.
4453                                                  *
4454                                                  * We're dealing with a
4455                                                  * copy-on-write fault on
4456                                                  * "object".
4457                                                  * We're going to decompress
4458                                                  * the page directly into the
4459                                                  * target "object" while
4460                                                  * keeping the compressed
4461                                                  * page for "cur_object", so
4462                                                  * no ledger update in that
4463                                                  * case.
4464                                                  */
4465                                         } else if (((cur_object->purgable ==
4466                                             VM_PURGABLE_DENY) &&
4467                                             (!cur_object->vo_ledger_tag)) ||
4468                                             (cur_object->vo_owner ==
4469                                             NULL)) {
4470                                                 /*
4471                                                  * "cur_object" is not purgeable
4472                                                  * and is not ledger-tagged, or
4473                                                  * there's no owner for it,
4474                                                  * so no owner's ledgers to
4475                                                  * update.
4476                                                  */
4477                                         } else {
4478                                                 /*
4479                                                  * One less compressed
4480                                                  * purgeable/tagged page for
4481                                                  * cur_object's owner.
4482                                                  */
4483                                                 vm_object_owner_compressed_update(
4484                                                         cur_object,
4485                                                         -1);
4486                                         }
4487
4488                                         if (insert_cur_object) {
4489                                                 vm_page_insert(m, cur_object, cur_offset);
4490                                                 m_object = cur_object;
4491                                         } else {
4492                                                 vm_page_insert(m, object, offset);
4493                                                 m_object = object;
4494                                         }
4495
4496                                         if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
4497                                                 /*
4498                                                  * If the page is not cacheable,
4499                                                  * we can't let its contents
4500                                                  * linger in the data cache
4501                                                  * after the decompression.
4502                                                  */
4503                                                 pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m));
4504                                         }
4505
4506                                         type_of_fault = my_fault_type;
4507
4508                                         VM_STAT_DECOMPRESSIONS();
4509
4510                                         if (cur_object != object) {
4511                                                 if (insert_cur_object) {
4512                                                         top_object = object;
4513                                                         /*
4514                                                          * switch to the object that has the new page
4515                                                          */
4516                                                         object = cur_object;
4517                                                         object_lock_type = cur_object_lock_type;
4518                                                 } else {
4519                                                         vm_object_unlock(cur_object);
4520                                                         cur_object = object;
4521                                                 }
4522                                         }
4523                                         goto FastPmapEnter;
4524                                 }
4525                                 /*
4526                                  * existence map present and indicates
4527                                  * that the pager doesn't have this page
4528                                  */
4529                         }
4530                         if (cur_object->shadow == VM_OBJECT_NULL ||
4531                             resilient_media_retry) {
4532                                 /*
4533                                  * Zero fill fault.  Page gets
4534                                  * inserted into the original object.
4535                                  */
4536                                 if (cur_object->shadow_severed ||
4537                                     VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) ||
4538                                     cur_object == compressor_object ||
4539                                     cur_object == kernel_object ||
4540                                     cur_object == vm_submap_object) {
4541                                         if (object != cur_object) {
4542                                                 vm_object_unlock(cur_object);
4543                                         }
4544                                         vm_object_unlock(object);
4545
4546                                         vm_map_unlock_read(map);
4547                                         if (real_map != map) {
4548                                                 vm_map_unlock(real_map);
4549                                         }
4550
4551                                         kr = KERN_MEMORY_ERROR;
4552                                         goto done;
4553                                 }
4554                                 if (cur_object != object) {
4555                                         vm_object_unlock(cur_object);
4556
4557                                         cur_object = object;
4558                                 }
4559                                 if (object_lock_type == OBJECT_LOCK_SHARED) {
4560                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4561
4562                                         if (vm_object_lock_upgrade(object) == FALSE) {
4563                                                 /*
4564                                                  * couldn't upgrade so do a full retry on the fault
4565                                                  * since we dropped the object lock which
4566                                                  * could allow another thread to insert
4567                                                  * a page at this offset
4568                                                  */
4569                                                 vm_map_unlock_read(map);
4570                                                 if (real_map != map) {
4571                                                         vm_map_unlock(real_map);
4572                                                 }
4573
4574                                                 goto RetryFault;
4575                                         }
4576                                 }
4577                                 if (!object->internal) {
4578                                         panic("%s:%d should not zero-fill page at offset 0x%llx in external object %p", __FUNCTION__, __LINE__, (uint64_t)offset, object);
4579                                 }
4580                                 m = vm_page_alloc(object, offset);
4581                                 m_object = NULL;
4582
4583                                 if (m == VM_PAGE_NULL) {
4584                                         /*
4585                                          * no free page currently available...
4586                                          * must take the slow path
4587                                          */
4588                                         break;
4589                                 }
4590                                 m_object = object;
4591
4592                                 /*
4593                                  * Now zero fill page...
4594                                  * the page is probably going to
4595                                  * be written soon, so don't bother
4596                                  * to clear the modified bit
4597                                  *
4598                                  *   NOTE: This code holds the map
4599                                  *   lock across the zero fill.
4600                                  */
4601                                 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
4602
4603                                 goto FastPmapEnter;
4604                         }
4605                         /*
4606                          * On to the next level in the shadow chain
4607                          */
4608                         cur_offset += cur_object->vo_shadow_offset;
4609                         new_object = cur_object->shadow;
4610
4611                         /*
4612                          * take the new_object's lock with the indicated state
4613                          */
4614                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4615                                 vm_object_lock_shared(new_object);
4616                         } else {
4617                                 vm_object_lock(new_object);
4618                         }
4619
4620                         if (cur_object != object) {
4621                                 vm_object_unlock(cur_object);
4622                         }
4623
4624                         cur_object = new_object;
4625
4626                         continue;
4627                 }
4628         }
4629         /*
4630          * Cleanup from fast fault failure.  Drop any object
4631          * lock other than original and drop map lock.
4632          */
4633         if (object != cur_object) {
4634                 vm_object_unlock(cur_object);
4635         }
4636
4637         /*
4638          * must own the object lock exclusively at this point
4639          */
4640         if (object_lock_type == OBJECT_LOCK_SHARED) {
4641                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4642
4643                 if (vm_object_lock_upgrade(object) == FALSE) {
4644                         /*
4645                          * couldn't upgrade, so explicitly
4646                          * take the lock exclusively
4647                          * no need to retry the fault at this
4648                          * point since "vm_fault_page" will
4649                          * completely re-evaluate the state
4650                          */
4651                         vm_object_lock(object);
4652                 }
4653         }
4654
4655 handle_copy_delay:
4656         vm_map_unlock_read(map);
4657         if (real_map != map) {
4658                 vm_map_unlock(real_map);
4659         }
4660
4661         if (__improbable(object == compressor_object ||
4662             object == kernel_object ||
4663             object == vm_submap_object)) {
4664                 /*
4665                  * These objects are explicitly managed and populated by the
4666                  * kernel.  The virtual ranges backed by these objects should
4667                  * either have wired pages or "holes" that are not supposed to
4668                  * be accessed at all until they get explicitly populated.
4669                  * We should never have to resolve a fault on a mapping backed
4670                  * by one of these VM objects and providing a zero-filled page
4671                  * would be wrong here, so let's fail the fault and let the
4672                  * caller crash or recover.
4673                  */
4674                 vm_object_unlock(object);
4675                 kr = KERN_MEMORY_ERROR;
4676                 goto done;
4677         }
4678
4679         assert(object != compressor_object);
4680         assert(object != kernel_object);
4681         assert(object != vm_submap_object);
4682
4683         if (resilient_media_retry) {
4684                 /*
4685                  * We could get here if we failed to get a free page
4686                  * to zero-fill and had to take the slow path again.
4687                  * Reset our "recovery-from-failed-media" state.
4688                  */
4689                 assert(resilient_media_object != VM_OBJECT_NULL);
4690                 assert(resilient_media_offset != (vm_object_offset_t)-1);
4691                 /* release our extra reference on failed object */
4692 //             printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
4693                 vm_object_deallocate(resilient_media_object);
4694                 resilient_media_object = VM_OBJECT_NULL;
4695                 resilient_media_offset = (vm_object_offset_t)-1;
4696                 resilient_media_retry = FALSE;
4697         }
4698
4699         /*
4700          * Make a reference to this object to
4701          * prevent its disposal while we are messing with
4702          * it.  Once we have the reference, the map is free
4703          * to be diddled.  Since objects reference their
4704          * shadows (and copies), they will stay around as well.
4705          */
4706         vm_object_reference_locked(object);
4707         vm_object_paging_begin(object);
4708
4709         set_thread_pagein_error(cthread, 0);
4710         error_code = 0;
4711
4712         result_page = VM_PAGE_NULL;
4713         kr = vm_fault_page(object, offset, fault_type,
4714             (change_wiring && !wired),
4715             FALSE,                /* page not looked up */
4716             &prot, &result_page, &top_page,
4717             &type_of_fault,
4718             &error_code, map->no_zero_fill,
4719             FALSE, &fault_info);
4720
4721         /*
4722          * if kr != VM_FAULT_SUCCESS, then the paging reference
4723          * has been dropped and the object unlocked... the ref_count
4724          * is still held
4725          *
4726          * if kr == VM_FAULT_SUCCESS, then the paging reference
4727          * is still held along with the ref_count on the original object
4728          *
4729          *      the object is returned locked with a paging reference
4730          *
4731          *      if top_page != NULL, then it's BUSY and the
4732          *      object it belongs to has a paging reference
4733          *      but is returned unlocked
4734          */
4735         if (kr != VM_FAULT_SUCCESS &&
4736             kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
4737                 if (kr == VM_FAULT_MEMORY_ERROR &&
4738                     fault_info.resilient_media) {
4739                         assertf(object->internal, "object %p", object);
4740                         /*
4741                          * This fault failed but the mapping was
4742                          * "media resilient", so we'll retry the fault in
4743                          * recovery mode to get a zero-filled page in the
4744                          * top object.
4745                          * Keep the reference on the failing object so
4746                          * that we can check that the mapping is still
4747                          * pointing to it when we retry the fault.
4748                          */
4749 //                     printf("RESILIENT_MEDIA %s:%d: object %p offset 0x%llx recover from media error 0x%x kr 0x%x top_page %p result_page %p\n", __FUNCTION__, __LINE__, object, offset, error_code, kr, top_page, result_page);
4750                         assert(!resilient_media_retry); /* no double retry */
4751                         assert(resilient_media_object == VM_OBJECT_NULL);
4752                         assert(resilient_media_offset == (vm_object_offset_t)-1);
4753                         resilient_media_retry = TRUE;
4754                         resilient_media_object = object;
4755                         resilient_media_offset = offset;
4756 //                     printf("FBDP %s:%d resilient_media_object %p offset 0x%llx kept reference\n", __FUNCTION__, __LINE__, resilient_media_object, resilient_media_offset);
4757                         goto RetryFault;
4758                 } else {
4759                         /*
4760                          * we didn't succeed, lose the object reference
4761                          * immediately.
4762                          */
4763                         vm_object_deallocate(object);
4764                         object = VM_OBJECT_NULL; /* no longer valid */
4765                 }
4766
4767                 /*
4768                  * See why we failed, and take corrective action.
4769                  */
4770                 switch (kr) {
4771                 case VM_FAULT_MEMORY_SHORTAGE:
4772                         if (vm_page_wait((change_wiring) ?
4773                             THREAD_UNINT :
4774                             THREAD_ABORTSAFE)) {
4775                                 goto RetryFault;
4776                         }
4777                 /*
4778                  * fall thru
4779                  */
4780                 case VM_FAULT_INTERRUPTED:
4781                         kr = KERN_ABORTED;
4782                         goto done;
4783                 case VM_FAULT_RETRY:
4784                         goto RetryFault;
4785                 case VM_FAULT_MEMORY_ERROR:
4786                         if (error_code) {
4787                                 kr = error_code;
4788                         } else {
4789                                 kr = KERN_MEMORY_ERROR;
4790                         }
4791                         goto done;
4792                 default:
4793                         panic("vm_fault: unexpected error 0x%x from "
4794                             "vm_fault_page()\n", kr);
4795                 }
4796         }
4797         m = result_page;
4798         m_object = NULL;
4799
4800         if (m != VM_PAGE_NULL) {
4801                 m_object = VM_PAGE_OBJECT(m);
4802                 assert((change_wiring && !wired) ?
4803                     (top_page == VM_PAGE_NULL) :
4804                     ((top_page == VM_PAGE_NULL) == (m_object == object)));
4805         }
4806
4807         /*
4808          * What to do with the resulting page from vm_fault_page
4809          * if it doesn't get entered into the physical map:
4810          */
4811 #define RELEASE_PAGE(m)                                 \
4812         MACRO_BEGIN                                     \
4813         PAGE_WAKEUP_DONE(m);                            \
4814         if ( !VM_PAGE_PAGEABLE(m)) {                    \
4815                 vm_page_lockspin_queues();              \
4816                 if ( !VM_PAGE_PAGEABLE(m))              \
4817                         vm_page_activate(m);            \
4818                 vm_page_unlock_queues();                \
4819         }                                               \
4820         MACRO_END
4821
4822
4823         object_locks_dropped = FALSE;
4824         /*
4825          * We must verify that the maps have not changed
4826          * since our last lookup. vm_map_verify() needs the
4827          * map lock (shared) but we are holding object locks.
4828          * So we do a try_lock() first and, if that fails, we
4829          * drop the object locks and go in for the map lock again.
4830          */
4831         if (!vm_map_try_lock_read(original_map)) {
4832                 if (m != VM_PAGE_NULL) {
4833                         old_copy_object = m_object->copy;
4834                         vm_object_unlock(m_object);
4835                 } else {
4836                         old_copy_object = VM_OBJECT_NULL;
4837                         vm_object_unlock(object);
4838                 }
4839
4840                 object_locks_dropped = TRUE;
4841
4842                 vm_map_lock_read(original_map);
4843         }
4844
4845         if ((map != original_map) || !vm_map_verify(map, &version)) {
4846                 if (object_locks_dropped == FALSE) {
4847                         if (m != VM_PAGE_NULL) {
4848                                 old_copy_object = m_object->copy;
4849                                 vm_object_unlock(m_object);
4850                         } else {
4851                                 old_copy_object = VM_OBJECT_NULL;
4852                                 vm_object_unlock(object);
4853                         }
4854
4855                         object_locks_dropped = TRUE;
4856                 }
4857
4858                 /*
4859                  * no object locks are held at this point
4860                  */
4861                 vm_object_t             retry_object;
4862                 vm_object_offset_t      retry_offset;
4863                 vm_prot_t               retry_prot;
4864
4865                 /*
4866                  * To avoid trying to write_lock the map while another
4867                  * thread has it read_locked (in vm_map_pageable), we
4868                  * do not try for write permission.  If the page is
4869                  * still writable, we will get write permission.  If it
4870                  * is not, or has been marked needs_copy, we enter the
4871                  * mapping without write permission, and will merely
4872                  * take another fault.
4873                  */
4874                 map = original_map;
4875
4876                 kr = vm_map_lookup_locked(&map, vaddr,
4877                     fault_type & ~VM_PROT_WRITE,
4878                     OBJECT_LOCK_EXCLUSIVE, &version,
4879                     &retry_object, &retry_offset, &retry_prot,
4880                     &wired,
4881                     &fault_info,
4882                     &real_map);
4883                 pmap = real_map->pmap;
4884
4885                 if (kr != KERN_SUCCESS) {
4886                         vm_map_unlock_read(map);
4887
4888                         if (m != VM_PAGE_NULL) {
4889                                 assert(VM_PAGE_OBJECT(m) == m_object);
4890
4891                                 /*
4892                                  * retake the lock so that
4893                                  * we can drop the paging reference
4894                                  * in vm_fault_cleanup and do the
4895                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
4896                                  */
4897                                 vm_object_lock(m_object);
4898
4899                                 RELEASE_PAGE(m);
4900
4901                                 vm_fault_cleanup(m_object, top_page);
4902                         } else {
4903                                 /*
4904                                  * retake the lock so that
4905                                  * we can drop the paging reference
4906                                  * in vm_fault_cleanup
4907                                  */
4908                                 vm_object_lock(object);
4909
4910                                 vm_fault_cleanup(object, top_page);
4911                         }
4912                         vm_object_deallocate(object);
4913
4914                         goto done;
4915                 }
4916                 vm_object_unlock(retry_object);
4917
4918                 if ((retry_object != object) || (retry_offset != offset)) {
4919                         vm_map_unlock_read(map);
4920                         if (real_map != map) {
4921                                 vm_map_unlock(real_map);
4922                         }
4923
4924                         if (m != VM_PAGE_NULL) {
4925                                 assert(VM_PAGE_OBJECT(m) == m_object);
4926
4927                                 /*
4928                                  * retake the lock so that
4929                                  * we can drop the paging reference
4930                                  * in vm_fault_cleanup and do the
4931                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
4932                                  */
4933                                 vm_object_lock(m_object);
4934
4935                                 RELEASE_PAGE(m);
4936
4937                                 vm_fault_cleanup(m_object, top_page);
4938                         } else {
4939                                 /*
4940                                  * retake the lock so that
4941                                  * we can drop the paging reference
4942                                  * in vm_fault_cleanup
4943                                  */
4944                                 vm_object_lock(object);
4945
4946                                 vm_fault_cleanup(object, top_page);
4947                         }
4948                         vm_object_deallocate(object);
4949
4950                         goto RetryFault;
4951                 }
4952                 /*
4953                  * Check whether the protection has changed or the object
4954                  * has been copied while we left the map unlocked.
4955                  */
4956                 if (pmap_has_prot_policy(retry_prot)) {
4957                         /* If the pmap layer cares, pass the full set. */
4958                         prot = retry_prot;
4959                 } else {
4960                         prot &= retry_prot;
4961                 }
4962         }
4963
4964         if (object_locks_dropped == TRUE) {
4965                 if (m != VM_PAGE_NULL) {
4966                         vm_object_lock(m_object);
4967
4968                         if (m_object->copy != old_copy_object) {
4969                                 /*
4970                                  * The copy object changed while the top-level object
4971                                  * was unlocked, so take away write permission.
4972                                  */
4973                                 assert(!pmap_has_prot_policy(prot));
4974                                 prot &= ~VM_PROT_WRITE;
4975                         }
4976                 } else {
4977                         vm_object_lock(object);
4978                 }
4979
4980                 object_locks_dropped = FALSE;
4981         }
4982
4983         if (!need_copy &&
4984             !fault_info.no_copy_on_read &&
4985             m != VM_PAGE_NULL &&
4986             VM_PAGE_OBJECT(m) != object &&
4987             !VM_PAGE_OBJECT(m)->pager_trusted &&
4988             vm_protect_privileged_from_untrusted &&
4989             !((prot & VM_PROT_EXECUTE) &&
4990             VM_PAGE_OBJECT(m)->code_signed &&
4991             cs_process_enforcement(NULL)) &&
4992             current_proc_is_privileged()) {
4993                 /*
4994                  * We found the page we want in an "untrusted" VM object
4995                  * down the shadow chain.  Since the target is "privileged"
4996                  * we want to perform a copy-on-read of that page, so that the
4997                  * mapped object gets a stable copy and does not have to
4998                  * rely on the "untrusted" object to provide the same
4999                  * contents if the page gets reclaimed and has to be paged
5000                  * in again later on.
5001                  *
5002                  * Special case: if the mapping is executable and the untrusted
5003                  * object is code-signed and the process is "cs_enforced", we
5004                  * do not copy-on-read because that would break code-signing
5005                  * enforcement expectations (an executable page must belong
5006                  * to a code-signed object) and we can rely on code-signing
5007                  * to re-validate the page if it gets evicted and paged back in.
5008                  */
5009 //              printf("COPY-ON-READ %s:%d map %p vaddr 0x%llx obj %p offset 0x%llx found page %p (obj %p offset 0x%llx) UNTRUSTED -> need copy-on-read\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, object, offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
5010                 vm_copied_on_read++;
5011                 need_copy_on_read = TRUE;
5012                 need_copy = TRUE;
5013         } else {
5014                 need_copy_on_read = FALSE;
5015         }
5016
5017         /*
5018          * If we want to wire down this page, but no longer have
5019          * adequate permissions, we must start all over.
5020          * If we decided to copy-on-read, we must also start all over.
5021          */
5022         if ((wired && (fault_type != (prot | VM_PROT_WRITE))) ||
5023             need_copy_on_read) {
5024                 vm_map_unlock_read(map);
5025                 if (real_map != map) {
5026                         vm_map_unlock(real_map);
5027                 }
5028
5029                 if (m != VM_PAGE_NULL) {
5030                         assert(VM_PAGE_OBJECT(m) == m_object);
5031
5032                         RELEASE_PAGE(m);
5033
5034                         vm_fault_cleanup(m_object, top_page);
5035                 } else {
5036                         vm_fault_cleanup(object, top_page);
5037                 }
5038
5039                 vm_object_deallocate(object);
5040
5041                 goto RetryFault;
5042         }
5043         if (m != VM_PAGE_NULL) {
5044                 /*
5045                  * Put this page into the physical map.
5046                  * We had to do the unlock above because pmap_enter
5047                  * may cause other faults.  The page may be on
5048                  * the pageout queues.  If the pageout daemon comes
5049                  * across the page, it will remove it from the queues.
5050                  */
5051                 if (caller_pmap) {
5052                         kr = vm_fault_enter(m,
5053                             caller_pmap,
5054                             caller_pmap_addr,
5055                             prot,
5056                             caller_prot,
5057                             wired,
5058                             change_wiring,
5059                             wire_tag,
5060                             &fault_info,
5061                             NULL,
5062                             &type_of_fault);
5063                 } else {
5064                         kr = vm_fault_enter(m,
5065                             pmap,
5066                             vaddr,
5067                             prot,
5068                             caller_prot,
5069                             wired,
5070                             change_wiring,
5071                             wire_tag,
5072                             &fault_info,
5073                             NULL,
5074                             &type_of_fault);
5075                 }
5076                 assert(VM_PAGE_OBJECT(m) == m_object);
5077
5078                 {
5079                         int     event_code = 0;
5080
5081                         if (m_object->internal) {
5082                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
5083                         } else if (m_object->object_is_shared_cache) {
5084                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
5085                         } else {
5086                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
5087                         }
5088
5089                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0);
5090
5091                         DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
5092                 }
5093                 if (kr != KERN_SUCCESS) {
5094                         /* abort this page fault */
5095                         vm_map_unlock_read(map);
5096                         if (real_map != map) {
5097                                 vm_map_unlock(real_map);
5098                         }
5099                         PAGE_WAKEUP_DONE(m);
5100                         vm_fault_cleanup(m_object, top_page);
5101                         vm_object_deallocate(object);
5102                         goto done;
5103                 }
5104                 if (physpage_p != NULL) {
5105                         /* for vm_map_wire_and_extract() */
5106                         *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
5107                         if (prot & VM_PROT_WRITE) {
5108                                 vm_object_lock_assert_exclusive(m_object);
5109                                 m->vmp_dirty = TRUE;
5110                         }
5111                 }
5112         } else {
5113                 vm_map_entry_t          entry;
5114                 vm_map_offset_t         laddr;
5115                 vm_map_offset_t         ldelta, hdelta;
5116
5117                 /*
5118                  * do a pmap block mapping from the physical address
5119                  * in the object
5120                  */
5121
5122                 if (real_map != map) {
5123                         vm_map_unlock(real_map);
5124                 }
5125
5126                 if (original_map != map) {
5127                         vm_map_unlock_read(map);
5128                         vm_map_lock_read(original_map);
5129                         map = original_map;
5130                 }
5131                 real_map = map;
5132
5133                 laddr = vaddr;
5134                 hdelta = 0xFFFFF000;
5135                 ldelta = 0xFFFFF000;
5136
5137                 while (vm_map_lookup_entry(map, laddr, &entry)) {
5138                         if (ldelta > (laddr - entry->vme_start)) {
5139                                 ldelta = laddr - entry->vme_start;
5140                         }
5141                         if (hdelta > (entry->vme_end - laddr)) {
5142                                 hdelta = entry->vme_end - laddr;
5143                         }
5144                         if (entry->is_sub_map) {
5145                                 laddr = ((laddr - entry->vme_start)
5146                                     + VME_OFFSET(entry));
5147                                 vm_map_lock_read(VME_SUBMAP(entry));
5148
5149                                 if (map != real_map) {
5150                                         vm_map_unlock_read(map);
5151                                 }
5152                                 if (entry->use_pmap) {
5153                                         vm_map_unlock_read(real_map);
5154                                         real_map = VME_SUBMAP(entry);
5155                                 }
5156                                 map = VME_SUBMAP(entry);
5157                         } else {
5158                                 break;
5159                         }
5160                 }
5161
5162                 if (vm_map_lookup_entry(map, laddr, &entry) &&
5163                     (VME_OBJECT(entry) != NULL) &&
5164                     (VME_OBJECT(entry) == object)) {
5165                         int superpage;
5166
5167                         if (!object->pager_created &&
5168                             object->phys_contiguous &&
5169                             VME_OFFSET(entry) == 0 &&
5170                             (entry->vme_end - entry->vme_start == object->vo_size) &&
5171                             VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size - 1))) {
5172                                 superpage = VM_MEM_SUPERPAGE;
5173                         } else {
5174                                 superpage = 0;
5175                         }
5176
5177                         if (superpage && physpage_p) {
5178                                 /* for vm_map_wire_and_extract() */
5179                                 *physpage_p = (ppnum_t)
5180                                     ((((vm_map_offset_t)
5181                                     object->vo_shadow_offset)
5182                                     + VME_OFFSET(entry)
5183                                     + (laddr - entry->vme_start))
5184                                     >> PAGE_SHIFT);
5185                         }
5186
5187                         if (caller_pmap) {
5188                                 /*
5189                                  * Set up a block mapped area
5190                                  */
5191                                 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
5192                                 kr = pmap_map_block(caller_pmap,
5193                                     (addr64_t)(caller_pmap_addr - ldelta),
5194                                     (ppnum_t)((((vm_map_offset_t) (VME_OBJECT(entry)->vo_shadow_offset)) +
5195                                     VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
5196                                     (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
5197                                     (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
5198
5199                                 if (kr != KERN_SUCCESS) {
5200                                         goto cleanup;
5201                                 }
5202                         } else {
5203                                 /*
5204                                  * Set up a block mapped area
5205                                  */
5206                                 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
5207                                 kr = pmap_map_block(real_map->pmap,
5208                                     (addr64_t)(vaddr - ldelta),
5209                                     (ppnum_t)((((vm_map_offset_t)(VME_OBJECT(entry)->vo_shadow_offset)) +
5210                                     VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
5211                                     (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
5212                                     (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
5213
5214                                 if (kr != KERN_SUCCESS) {
5215                                         goto cleanup;
5216                                 }
5217                         }
5218                 }
5219         }
5220
5221         /*
5222          * Success
5223          */
5224         kr = KERN_SUCCESS;
5225
5226         /*
5227          * TODO: could most of the done cases just use cleanup?
5228          */
5229 cleanup:
5230         /*
5231          * Unlock everything, and return
5232          */
5233         vm_map_unlock_read(map);
5234         if (real_map != map) {
5235                 vm_map_unlock(real_map);
5236         }
5237
5238         if (m != VM_PAGE_NULL) {
5239                 assert(VM_PAGE_OBJECT(m) == m_object);
5240
5241                 if (!m_object->internal && (fault_type & VM_PROT_WRITE)) {
5242                         vm_object_paging_begin(m_object);
5243
5244                         assert(written_on_object == VM_OBJECT_NULL);
5245                         written_on_object = m_object;
5246                         written_on_pager = m_object->pager;
5247                         written_on_offset = m_object->paging_offset + m->vmp_offset;
5248                 }
5249                 PAGE_WAKEUP_DONE(m);
5250
5251                 vm_fault_cleanup(m_object, top_page);
5252         } else {
5253                 vm_fault_cleanup(object, top_page);
5254         }
5255
5256         vm_object_deallocate(object);
5257
5258 #undef  RELEASE_PAGE
5259
5260 done:
5261         thread_interrupt_level(interruptible_state);
5262
5263         if (resilient_media_object != VM_OBJECT_NULL) {
5264                 assert(resilient_media_retry);
5265                 assert(resilient_media_offset != (vm_object_offset_t)-1);
5266                 /* release extra reference on failed object */
5267 //             printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
5268                 vm_object_deallocate(resilient_media_object);
5269                 resilient_media_object = VM_OBJECT_NULL;
5270                 resilient_media_offset = (vm_object_offset_t)-1;
5271                 resilient_media_retry = FALSE;
5272         }
5273         assert(!resilient_media_retry);
5274
5275         /*
5276          * Only I/O throttle on faults which cause a pagein/swapin.
5277          */
5278         if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
5279                 throttle_lowpri_io(1);
5280         } else {
5281                 if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
5282                         if ((throttle_delay = vm_page_throttled(TRUE))) {
5283                                 if (vm_debug_events) {
5284                                         if (type_of_fault == DBG_COMPRESSOR_FAULT) {
5285                                                 VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5286                                         } else if (type_of_fault == DBG_COW_FAULT) {
5287                                                 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5288                                         } else {
5289                                                 VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5290                                         }
5291                                 }
5292                                 delay(throttle_delay);
5293                         }
5294                 }
5295         }
5296
5297         if (written_on_object) {
5298                 vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64);
5299
5300                 vm_object_lock(written_on_object);
5301                 vm_object_paging_end(written_on_object);
5302                 vm_object_unlock(written_on_object);
5303
5304                 written_on_object = VM_OBJECT_NULL;
5305         }
5306
5307         if (rtfault) {
5308                 vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault);
5309         }
5310
5311         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
5312             (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
5313             ((uint64_t)trace_vaddr >> 32),
5314             trace_vaddr,
5315             kr,
5316             type_of_fault,
5317             0);
5318
5319         return kr;
5320 }
5321
5322 /*
5323  *      vm_fault_wire:
5324  *
5325  *      Wire down a range of virtual addresses in a map.
      *
      *      The range [entry->vme_start, entry->vme_end) is walked one
      *      PAGE_SIZE step at a time.  Each page is first handed to
      *      vm_fault_wire_fast(); if that does not return KERN_SUCCESS
      *      the page is retried through vm_fault_internal().
      *
      *      Parameters:
      *        map         map passed to the per-page fault routines
      *        entry       map entry to wire; asserted to be in_transition
      *        prot        protection passed to the per-page fault routines
      *        wire_tag    tag passed to the per-page fault routines
      *        pmap        physical map passed to pmap_pageable() and to
      *                    the per-page fault routines
      *        pmap_addr   address in "pmap" that corresponds to
      *                    entry->vme_start
      *        physpage_p  passed unchanged to the fault routines for
      *                    every page of the range
      *
      *      Returns KERN_SUCCESS once every page has been wired, or
      *      immediately (without doing anything) when the entry maps a
      *      phys_contiguous object.  If a page fails on both paths, the
      *      pages before it are unwired with vm_fault_unwire() and the
      *      error from vm_fault_internal() is returned.
5326  */
5327 kern_return_t
5328 vm_fault_wire(
5329         vm_map_t        map,
5330         vm_map_entry_t  entry,
5331         vm_prot_t       prot,
5332         vm_tag_t        wire_tag,
5333         pmap_t          pmap,
5334         vm_map_offset_t pmap_addr,
5335         ppnum_t         *physpage_p)
5336 {
5337         vm_map_offset_t va;
5338         vm_map_offset_t end_addr = entry->vme_end;
5339         kern_return_t   rc;
5340
5341         assert(entry->in_transition);
5342
             /*
              * Nothing to do for a non-submap entry whose object is
              * phys_contiguous; vm_fault_unwire() skips such entries too.
              */
5343         if ((VME_OBJECT(entry) != NULL) &&
5344             !entry->is_sub_map &&
5345             VME_OBJECT(entry)->phys_contiguous) {
5346                 return KERN_SUCCESS;
5347         }
5348
5349         /*
5350          *      Inform the physical mapping system that the
5351          *      range of addresses may not fault, so that
5352          *      page tables and such can be locked down as well.
5353          */
5354
5355         pmap_pageable(pmap, pmap_addr,
5356             pmap_addr + (end_addr - entry->vme_start), FALSE);
5357
5358         /*
5359          *      We simulate a fault to get the page and enter it
5360          *      in the physical map.
5361          */
5362
5363         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
                     /* Fast path first; "va" maps to pmap_addr + (va - vme_start). */
5364                 rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap,
5365                     pmap_addr + (va - entry->vme_start),
5366                     physpage_p);
5367                 if (rc != KERN_SUCCESS) {
                             /*
                              * Fall back to the general fault handler.  It is
                              * called THREAD_UNINT for the kernel pmap and
                              * THREAD_ABORTSAFE otherwise.
                              * NOTE(review): the TRUE argument is presumably
                              * change_wiring -- confirm against the
                              * vm_fault_internal() prototype.
                              */
5368                         rc = vm_fault_internal(map, va, prot, TRUE, wire_tag,
5369                             ((pmap == kernel_pmap)
5370                             ? THREAD_UNINT
5371                             : THREAD_ABORTSAFE),
5372                             pmap,
5373                             (pmap_addr +
5374                             (va - entry->vme_start)),
5375                             physpage_p);
5376                         DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
5377                 }
5378
5379                 if (rc != KERN_SUCCESS) {
                             /*
                              * Roll back: tmp_entry is a copy of *entry cut
                              * off at the failing address, so only
                              * [vme_start, va) is unwired.  FALSE is the
                              * "deallocate" argument: the pages are unwired,
                              * not freed.
                              */
5380                         struct vm_map_entry     tmp_entry = *entry;
5381
5382                         /* unwire wired pages */
5383                         tmp_entry.vme_end = va;
5384                         vm_fault_unwire(map,
5385                             &tmp_entry, FALSE, pmap, pmap_addr);
5386
5387                         return rc;
5388                 }
5389         }
5390         return KERN_SUCCESS;
5391 }
5392
5393 /*
5394  *      vm_fault_unwire:
5395  *
5396  *      Unwire a range of virtual addresses in a map.
      *
      *      For every PAGE_SIZE step in [entry->vme_start, entry->vme_end):
      *        - if the entry is a submap or has no VM object, the wiring
      *          is dropped with pmap_change_wiring() followed by a
      *          VM_PROT_NONE vm_fault() call whose result is ignored;
      *        - otherwise the page is obtained with vm_fault_page() and
      *          is either pmap_disconnect()ed and freed ("deallocate"
      *          set) or unwired and woken up.
      *
      *      Parameters:
      *        map         map containing the entry
      *        entry       map entry whose range is unwired
      *        deallocate  TRUE to free each page instead of only
      *                    unwiring it
      *        pmap        physical map; pmap_change_wiring() is only
      *                    called when this is non-NULL
      *        pmap_addr   address in "pmap" that corresponds to
      *                    entry->vme_start
      *
      *      Entries backed by a phys_contiguous object are left untouched.
      *      Panics if vm_fault_page() fails for a reason other than the
      *      two VM_FAULT_MEMORY_ERROR cases handled in the loop.
5397  */
5398 void
5399 vm_fault_unwire(
5400         vm_map_t        map,
5401         vm_map_entry_t  entry,
5402         boolean_t       deallocate,
5403         pmap_t          pmap,
5404         vm_map_offset_t pmap_addr)
5405 {
5406         vm_map_offset_t va;
5407         vm_map_offset_t end_addr = entry->vme_end;
5408         vm_object_t             object;
5409         struct vm_object_fault_info fault_info = {};
5410         unsigned int    unwired_pages;
5411
             /* Submap entries are handled as if they had no object. */
5412         object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);
5413
5414         /*
5415          * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
5416          * do anything since such memory is wired by default.  So we don't have
5417          * anything to undo here.
5418          */
5419
5420         if (object != VM_OBJECT_NULL && object->phys_contiguous) {
5421                 return;
5422         }
5423
             /*
              * Build the fault_info handed to vm_fault_page() below from
              * the attributes of the map entry.
              */
5424         fault_info.interruptible = THREAD_UNINT;
5425         fault_info.behavior = entry->behavior;
5426         fault_info.user_tag = VME_ALIAS(entry);
5427         if (entry->iokit_acct ||
5428             (!entry->is_sub_map && !entry->use_pmap)) {
5429                 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
5430         }
5431         fault_info.lo_offset = VME_OFFSET(entry);
5432         fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
5433         fault_info.no_cache = entry->no_cache;
5434         fault_info.stealth = TRUE;
5435
             /* Counts pages found VM_PAGE_WIRED; used for tag accounting at the end. */
5436         unwired_pages = 0;
5437
5438         /*
5439          *      Since the pages are wired down, we must be able to
5440          *      get their mappings from the physical map system.
5441          */
5442
5443         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
5444                 if (object == VM_OBJECT_NULL) {
5445                         if (pmap) {
5446                                 pmap_change_wiring(pmap,
5447                                     pmap_addr + (va - entry->vme_start), FALSE);
5448                         }
                             /*
                              * NOTE(review): pmap_addr is passed here without
                              * the (va - entry->vme_start) offset that the
                              * pmap_change_wiring() call above applies --
                              * confirm this is intentional.
                              */
5449                         (void) vm_fault(map, va, VM_PROT_NONE,
5450                             TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr);
5451                 } else {
5452                         vm_prot_t       prot;
5453                         vm_page_t       result_page;
5454                         vm_page_t       top_page;
5455                         vm_object_t     result_object;
5456                         vm_fault_return_t result;
5457
5458                         /* cap cluster size at maximum UPL size */
                             /*
                              * os_sub_overflow() presumably reports that
                              * end_addr - va does not fit in upl_size_t; in
                              * that case 0 - PAGE_SIZE is used instead.
                              */
5459                         upl_size_t cluster_size;
5460                         if (os_sub_overflow(end_addr, va, &cluster_size)) {
5461                                 cluster_size = 0 - (upl_size_t)PAGE_SIZE;
5462                         }
5463                         fault_info.cluster_size = cluster_size;
5464
                             /*
                              * The object is locked and a paging reference is
                              * taken before every vm_fault_page() attempt; the
                              * attempt is repeated while it asks for a retry.
                              */
5465                         do {
5466                                 prot = VM_PROT_NONE;
5467
5468                                 vm_object_lock(object);
5469                                 vm_object_paging_begin(object);
5470                                 result_page = VM_PAGE_NULL;
5471                                 result = vm_fault_page(
5472                                         object,
5473                                         (VME_OFFSET(entry) +
5474                                         (va - entry->vme_start)),
5475                                         VM_PROT_NONE, TRUE,
5476                                         FALSE, /* page not looked up */
5477                                         &prot, &result_page, &top_page,
5478                                         (int *)0,
5479                                         NULL, map->no_zero_fill,
5480                                         FALSE, &fault_info);
5481                         } while (result == VM_FAULT_RETRY);
5482
5483                         /*
5484                          * If this was a mapping to a file on a device that has been forcibly
5485                          * unmounted, then we won't get a page back from vm_fault_page().  Just
5486                          * move on to the next one in case the remaining pages are mapped from
5487                          * different objects.  During a forced unmount, the object is terminated
5488                          * so the alive flag will be false if this happens.  A forced unmount
5489                          * will occur when an external disk is unplugged before the user does an
5490                          * eject, so we don't want to panic in that situation.
5491                          */
5492
5493                         if (result == VM_FAULT_MEMORY_ERROR && !object->alive) {
5494                                 continue;
5495                         }
5496
5497                         if (result == VM_FAULT_MEMORY_ERROR &&
5498                             object == kernel_object) {
5499                                 /*
5500                                  * This must have been allocated with
5501                                  * KMA_KOBJECT and KMA_VAONLY and there's
5502                                  * no physical page at this offset.
5503                                  * We're done (no page to free).
5504                                  */
5505                                 assert(deallocate);
5506                                 continue;
5507                         }
5508
                             /* Any other failure is treated as fatal. */
5509                         if (result != VM_FAULT_SUCCESS) {
5510                                 panic("vm_fault_unwire: failure");
5511                         }
5512
                             /*
                              * Clean up against the object that owns the page
                              * returned by vm_fault_page(), read from the page
                              * itself rather than assumed to be "object".
                              */
5513                         result_object = VM_PAGE_OBJECT(result_page);
5514
5515                         if (deallocate) {
                                     /* Remove the page's mappings, then free it. */
5516                                 assert(VM_PAGE_GET_PHYS_PAGE(result_page) !=
5517                                     vm_page_fictitious_addr);
5518                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page));
5519                                 if (VM_PAGE_WIRED(result_page)) {
5520                                         unwired_pages++;
5521                                 }
5522                                 VM_PAGE_FREE(result_page);
5523                         } else {
                                     /* Guard pages are skipped for the pmap wiring change. */
5524                                 if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr)) {
5525                                         pmap_change_wiring(pmap,
5526                                             pmap_addr + (va - entry->vme_start), FALSE);
5527                                 }
5528
5529
                                     /* vm_page_unwire() is called with the page queues locked. */
5530                                 if (VM_PAGE_WIRED(result_page)) {
5531                                         vm_page_lockspin_queues();
5532                                         vm_page_unwire(result_page, TRUE);
5533                                         vm_page_unlock_queues();
5534                                         unwired_pages++;
5535                                 }
                                     /*
                                      * The flag is cleared as soon as one page has
                                      * been zeroed, so later pages of this call are
                                      * not zeroed.
                                      * NOTE(review): confirm that zeroing a single
                                      * page per entry is the intended behavior.
                                      */
5536                                 if (entry->zero_wired_pages) {
5537                                         pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page));
5538                                         entry->zero_wired_pages = FALSE;
5539                                 }
5540
5541                                 PAGE_WAKEUP_DONE(result_page);
5542                         }
5543                         vm_fault_cleanup(result_object, top_page);
5544                 }
5545         }
5546
5547         /*
5548          *      Inform the physical mapping system that the range
5549          *      of addresses may fault, so that page tables and
5550          *      such may be unwired themselves.
5551          */
5552
5553         pmap_pageable(pmap, pmap_addr,
5554             pmap_addr + (end_addr - entry->vme_start), TRUE);
5555
             /*
              * For the kernel object, adjust the size recorded for the
              * entry's tag by the pages counted above.  ptoa_64()
              * presumably converts the page count to bytes -- confirm.
              */
5556         if (kernel_object == object) {
5557                 vm_tag_update_size(fault_info.user_tag, -ptoa_64(unwired_pages));
5558         }
5559 }
5560
5561 /*
5562  *      vm_fault_wire_fast:
5563  *
5564  *      Handle common case of a wire down page fault at the given address.
5565  *      If successful, the page is inserted into the associated physical map.
5566  *      The map entry is passed in to avoid the overhead of a map lookup.
5567  *
5568  *      NOTE: the given address should be truncated to the
5569  *      proper page address.
5570  *
      *      Parameters:
      *              map, caller_prot
      *                      unused by this routine (marked __unused).
      *              va      faulting address; converted to an object offset
      *                      relative to entry->vme_start and VME_OFFSET(entry).
      *              wire_tag
      *                      tag handed to vm_page_wire() and vm_fault_enter().
      *              entry   map entry covering "va"; a submap entry is
      *                      rejected with KERN_FAILURE.
      *              pmap, pmap_addr
      *                      physical map and address handed to vm_fault_enter().
      *              physpage_p
      *                      optional; when non-NULL it receives the physical
      *                      page number of the wired page on success.
      *
5571  *      KERN_SUCCESS is returned if the page fault is handled (a guard page
5572  *      counts as handled); every bail-out path here returns KERN_FAILURE.
5573  *
5574  *      The map in question must be referenced, and remains so.
5575  *      Caller has a read lock on the map.
5576  *
5577  *      This is a stripped version of vm_fault() for wiring pages.  Anything
5578  *      other than the common case will return KERN_FAILURE, and the caller
5579  *      is expected to call vm_fault().
5580  */
5581 static kern_return_t
5582 vm_fault_wire_fast(
5583         __unused vm_map_t       map,
5584         vm_map_offset_t va,
5585         __unused vm_prot_t       caller_prot,
5586         vm_tag_t        wire_tag,
5587         vm_map_entry_t  entry,
5588         pmap_t          pmap,
5589         vm_map_offset_t pmap_addr,
5590         ppnum_t         *physpage_p)
5591 {
5592         vm_object_t             object;
5593         vm_object_offset_t      offset;
5594         vm_page_t               m;
5595         vm_prot_t               prot;
5596         thread_t                thread = current_thread();
5597         int                     type_of_fault;
5598         kern_return_t           kr;
5599         struct vm_object_fault_info fault_info = {};
5600
             /* Count this as a fault, globally and against the current task. */
5601         VM_STAT_INCR(faults);
5602
5603         if (thread != THREAD_NULL && thread->task != TASK_NULL) {
5604                 thread->task->faults++;
5605         }
5606
5607 /*
5608  *      Recovery actions
      *
      *      RELEASE_PAGE(m):        PAGE_WAKEUP_DONE() the page, then unwire it
      *                              under the page queues lock.
      *      UNLOCK_THINGS:          end the paging reference on "object" and
      *                              unlock it.
      *      UNLOCK_AND_DEALLOCATE:  UNLOCK_THINGS, then drop the object
      *                              reference taken at the top of the routine.
      *      GIVE_UP:                UNLOCK_AND_DEALLOCATE and return
      *                              KERN_FAILURE from this function.
5609  */
5610
5611 #undef  RELEASE_PAGE
5612 #define RELEASE_PAGE(m) {                               \
5613         PAGE_WAKEUP_DONE(m);                            \
5614         vm_page_lockspin_queues();                      \
5615         vm_page_unwire(m, TRUE);                        \
5616         vm_page_unlock_queues();                        \
5617 }
5618
5619
5620 #undef  UNLOCK_THINGS
5621 #define UNLOCK_THINGS   {                               \
5622         vm_object_paging_end(object);                      \
5623         vm_object_unlock(object);                          \
5624 }
5625
5626 #undef  UNLOCK_AND_DEALLOCATE
5627 #define UNLOCK_AND_DEALLOCATE   {                       \
5628         UNLOCK_THINGS;                                  \
5629         vm_object_deallocate(object);                   \
5630 }
5631 /*
5632  *      Give up and have caller do things the hard way.
5633  */
5634
5635 #define GIVE_UP {                                       \
5636         UNLOCK_AND_DEALLOCATE;                          \
5637         return(KERN_FAILURE);                           \
5638 }
5639
5640
5641         /*
5642          *      If this entry is not directly to a vm_object, bail out.
          *      Nothing has been locked or referenced yet, so a plain
          *      return is used instead of GIVE_UP.
5643          */
5644         if (entry->is_sub_map) {
5645                 assert(physpage_p == NULL);
5646                 return KERN_FAILURE;
5647         }
5648
5649         /*
5650          *      Find the backing store object and offset into it.
5651          */
5652
5653         object = VME_OBJECT(entry);
5654         offset = (va - entry->vme_start) + VME_OFFSET(entry);
5655         prot = entry->protection;
5656
5657         /*
5658          *      Make a reference to this object to prevent its
5659          *      disposal while we are messing with it.
5660          */
5661
5662         vm_object_lock(object);
5663         vm_object_reference_locked(object);
5664         vm_object_paging_begin(object);
5665
5666         /*
5667          *      INVARIANTS (through entire routine):
5668          *
5669          *      1)      At all times, we must either have the object
5670          *              lock or a busy page in some object to prevent
5671          *              some other thread from trying to bring in
5672          *              the same page.
5673          *
5674          *      2)      Once we have a busy page, we must remove it from
5675          *              the pageout queues, so that the pageout daemon
5676          *              will not grab it away.
5677          *
5678          */
5679
5680         /*
5681          *      Look for page in top-level object.  If it's not there or
5682          *      there's something going on, give up.
5683          */
5684         m = vm_page_lookup(object, offset);
5685         if ((m == VM_PAGE_NULL) || (m->vmp_busy) ||
5686             (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_absent))) {
5687                 GIVE_UP;
5688         }
5689         if (m->vmp_fictitious &&
5690             VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
5691                 /*
5692                  * Guard pages are fictitious pages and are never
5693                  * entered into a pmap, so let's say it's been wired...
                  * The page is neither wired nor marked busy on this path.
5694                  */
5695                 kr = KERN_SUCCESS;
5696                 goto done;
5697         }
5698
5699         /*
5700          *      Wire the page down now.  All bail outs beyond this
5701          *      point must unwire the page.
5702          */
5703
5704         vm_page_lockspin_queues();
5705         vm_page_wire(m, wire_tag, TRUE);
5706         vm_page_unlock_queues();
5707
5708         /*
5709          *      Mark page busy for other threads.
5710          */
5711         assert(!m->vmp_busy);
5712         m->vmp_busy = TRUE;
5713         assert(!m->vmp_absent);
5714
5715         /*
5716          *      Give up if the page is being written and there's a copy object
5717          */
5718         if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
5719                 RELEASE_PAGE(m);
5720                 GIVE_UP;
5721         }
5722
             /*
              *      Build the fault info handed to vm_fault_enter().  The
              *      is_sub_map test below is always false at this point
              *      (submap entries returned early above).
              */
5723         fault_info.user_tag = VME_ALIAS(entry);
5724         fault_info.pmap_options = 0;
5725         if (entry->iokit_acct ||
5726             (!entry->is_sub_map && !entry->use_pmap)) {
5727                 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
5728         }
5729
5730         /*
5731          *      Put this page into the physical map.
5732          */
5733         type_of_fault = DBG_CACHE_HIT_FAULT;
5734         kr = vm_fault_enter(m,
5735             pmap,
5736             pmap_addr,
5737             prot,
5738             prot,
5739             TRUE,                  /* wired */
5740             FALSE,                 /* change_wiring */
5741             wire_tag,
5742             &fault_info,
5743             NULL,
5744             &type_of_fault);
5745         if (kr != KERN_SUCCESS) {
                     /* the specific error in "kr" is dropped; GIVE_UP returns KERN_FAILURE */
5746                 RELEASE_PAGE(m);
5747                 GIVE_UP;
5748         }
5749
5750 done:
5751         /*
5752          *      Unlock everything, and return
          *
          *      Both ways of getting here (the guard-page shortcut and a
          *      successful vm_fault_enter()) have kr == KERN_SUCCESS, so
          *      the "else" arm below is purely defensive.
5753          */
5754
5755         if (physpage_p) {
5756                 /* for vm_map_wire_and_extract() */
5757                 if (kr == KERN_SUCCESS) {
5758                         assert(object == VM_PAGE_OBJECT(m));
5759                         *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
5760                         if (prot & VM_PROT_WRITE) {
                                     /* the entry allows writes: mark the page dirty */
5761                                 vm_object_lock_assert_exclusive(object);
5762                                 m->vmp_dirty = TRUE;
5763                         }
5764                 } else {
5765                         *physpage_p = 0;
5766                 }
5767         }
5768
5769         PAGE_WAKEUP_DONE(m);
5770         UNLOCK_AND_DEALLOCATE;
5771
5772         return kr;
5773 }
5774
5775 /*
5776  *      Routine:        vm_fault_copy_cleanup
5777  *      Purpose:
5778  *              Let go of a source page that vm_fault_copy was using.
      *
      *              The object owning "page" is locked, PAGE_WAKEUP_DONE()
      *              is applied to the page, the page is activated if
      *              VM_PAGE_PAGEABLE() reports it is not pageable, and the
      *              fault is then finished through vm_fault_cleanup() with
      *              "top_page".
      *              NOTE(review): the object lock taken here is not dropped
      *              in this routine; presumably vm_fault_cleanup() releases
      *              it -- confirm against its definition.
5779  */
5780
5781 static void
5782 vm_fault_copy_cleanup(
5783         vm_page_t       page,
5784         vm_page_t       top_page)
5785 {
5786         vm_object_t     page_object;
5787
             page_object = VM_PAGE_OBJECT(page);
5788         vm_object_lock(page_object);
5789         PAGE_WAKEUP_DONE(page);
5790         if (!VM_PAGE_PAGEABLE(page)) {
5791                 vm_page_lockspin_queues();
                     /* test again now that the page queues lock is held */
5792                 if (!VM_PAGE_PAGEABLE(page)) {
5793                         vm_page_activate(page);
5794                 }
5795                 vm_page_unlock_queues();
5796         }
5797         vm_fault_cleanup(page_object, top_page);
5798 }
5799
     /*
      *      Routine:        vm_fault_copy_dst_cleanup
      *      Purpose:
      *              Release a destination page used by vm_fault_copy:
      *              unwire it and end the paging reference on its object.
      *              A VM_PAGE_NULL "page" is accepted and ignored.
      */
5800 static void
5801 vm_fault_copy_dst_cleanup(
5802         vm_page_t       page)
5803 {
5804         vm_object_t     dst_page_object;
5805
5806         if (page == VM_PAGE_NULL) {
                     return;
             }
             dst_page_object = VM_PAGE_OBJECT(page);
5807
5808         vm_object_lock(dst_page_object);
5809         vm_page_lockspin_queues();
5810         vm_page_unwire(page, TRUE);
5811         vm_page_unlock_queues();
5812         vm_object_paging_end(dst_page_object);
5813         vm_object_unlock(dst_page_object);
5815 }
5816
5817 /*
5818  *      Routine:        vm_fault_copy
5819  *
5820  *      Purpose:
5821  *              Copy pages from one virtual memory object to another --
5822  *              neither the source nor destination pages need be resident.
5823  *
5824  *              Before actually copying a page, the version associated with
5825  *              the destination address map will be verified.
5826  *
      *      Parameters:
      *              src_object      source object; VM_OBJECT_NULL means
      *                              "no source": the destination range is
      *                              zero-filled instead.
      *              src_offset, dst_offset
      *                              byte offsets into the objects; they
      *                              need not be page aligned (partial
      *                              pages are handled).
      *              copy_size       INOUT: bytes requested on entry; bytes
      *                              actually processed on the paths that
      *                              leave through RETURN().
      *              dst_object      destination object.
      *              dst_map, dst_version
      *                              map and version checked with
      *                              vm_map_verify() before each page copy.
      *              interruptible   passed to the page faults and to
      *                              vm_page_wait().
      *
5827  *      In/out conditions:
5828  *              The caller must hold a reference, but not a lock, to
5829  *              each of the source and destination objects and to the
5830  *              destination map.
5831  *
5832  *      Results:
5833  *              Returns KERN_SUCCESS if no errors were encountered in
5834  *              reading or writing the data.  Returns MACH_SEND_INTERRUPTED
5835  *              if a page fault was interrupted, or ran short of memory
5836  *              and vm_page_wait() did not ask for a retry.  Other return
5837  *              values indicate a permanent error in copying the data.
5838  *
5839  *              The actual amount of data copied will be returned in the
5840  *              "copy_size" argument.  In the event that the destination map
5841  *              verification failed, this amount may be less than the amount
5842  *              requested.  The memory-error returns leave "copy_size"
      *              untouched.
5843  */
5844 kern_return_t
5845 vm_fault_copy(
5846         vm_object_t             src_object,
5847         vm_object_offset_t      src_offset,
5848         vm_map_size_t           *copy_size,             /* INOUT */
5849         vm_object_t             dst_object,
5850         vm_object_offset_t      dst_offset,
5851         vm_map_t                dst_map,
5852         vm_map_version_t         *dst_version,
5853         int                     interruptible)
5854 {
5855         vm_page_t               result_page;
5856
5857         vm_page_t               src_page;
5858         vm_page_t               src_top_page;
5859         vm_prot_t               src_prot;
5860
5861         vm_page_t               dst_page;
5862         vm_page_t               dst_top_page;
5863         vm_prot_t               dst_prot;
5864
5865         vm_map_size_t           amount_left;
5866         vm_object_t             old_copy_object;
5867         vm_object_t             result_page_object = NULL;
5868         kern_return_t           error = 0;
5869         vm_fault_return_t       result;
5870
5871         vm_map_size_t           part_size;
5872         struct vm_object_fault_info fault_info_src = {};
5873         struct vm_object_fault_info fault_info_dst = {};
5874
5875         /*
5876          * In order not to confuse the clustered pageins, align
5877          * the different offsets on a page boundary.
5878          */
5879
             /* RETURN(x): report the amount actually processed, then return x. */
5880 #define RETURN(x)                                       \
5881         MACRO_BEGIN                                     \
5882         *copy_size -= amount_left;                      \
5883         MACRO_RETURN(x);                                \
5884         MACRO_END
5885
5886         amount_left = *copy_size;
5887
5888         fault_info_src.interruptible = interruptible;
5889         fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
5890         fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
5891         fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
5892         fault_info_src.stealth = TRUE;
5893
5894         fault_info_dst.interruptible = interruptible;
5895         fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
5896         fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
5897         fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
5898         fault_info_dst.stealth = TRUE;
5899
5900         do { /* while (amount_left > 0) */
5901                 /*
5902                  * There may be a deadlock if both source and destination
5903                  * pages are the same. To avoid this deadlock, the copy must
5904                  * start by getting the destination page in order to apply
5905                  * COW semantics if any.
5906                  */
5907
5908 RetryDestinationFault:;
5909
5910                 dst_prot = VM_PROT_WRITE | VM_PROT_READ;
5911
5912                 vm_object_lock(dst_object);
5913                 vm_object_paging_begin(dst_object);
5914
5915                 /* cap cluster size at maximum UPL size */
5916                 upl_size_t cluster_size;
5917                 if (os_convert_overflow(amount_left, &cluster_size)) {
5918                         cluster_size = 0 - (upl_size_t)PAGE_SIZE;
5919                 }
5920                 fault_info_dst.cluster_size = cluster_size;
5921
5922                 dst_page = VM_PAGE_NULL;
5923                 result = vm_fault_page(dst_object,
5924                     vm_object_trunc_page(dst_offset),
5925                     VM_PROT_WRITE | VM_PROT_READ,
5926                     FALSE,
5927                     FALSE,                    /* page not looked up */
5928                     &dst_prot, &dst_page, &dst_top_page,
5929                     (int *)0,
5930                     &error,
5931                     dst_map->no_zero_fill,
5932                     FALSE, &fault_info_dst);
5933                 switch (result) {
5934                 case VM_FAULT_SUCCESS:
5935                         break;
5936                 case VM_FAULT_RETRY:
5937                         goto RetryDestinationFault;
5938                 case VM_FAULT_MEMORY_SHORTAGE:
5939                         if (vm_page_wait(interruptible)) {
5940                                 goto RetryDestinationFault;
5941                         }
5942                 /* fall thru */
5943                 case VM_FAULT_INTERRUPTED:
5944                         RETURN(MACH_SEND_INTERRUPTED);
5945                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5946                         /* success but no VM page: fail the copy */
5947                         vm_object_paging_end(dst_object);
5948                         vm_object_unlock(dst_object);
5949                 /*FALLTHROUGH*/
5950                 case VM_FAULT_MEMORY_ERROR:
5951                         if (error) {
5952                                 return error;
5953                         } else {
5954                                 return KERN_MEMORY_ERROR;
5955                         }
5956                 default:
5957                         panic("vm_fault_copy: unexpected error 0x%x from "
5958                             "vm_fault_page()\n", result);
5959                 }
5960                 assert((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
5961
5962                 assert(dst_object == VM_PAGE_OBJECT(dst_page));
                     /* remembered so a change of copy object can be detected below */
5963                 old_copy_object = dst_object->copy;
5964
5965                 /*
5966                  * There exists the possibility that the source and
5967                  * destination page are the same.  But we can't
5968                  * easily determine that now.  If they are the
5969                  * same, the call to vm_fault_page() for the
5970                  * destination page will deadlock.  To prevent this we
5971                  * wire the page so we can drop busy without having
5972                  * the page daemon steal the page.  We clean up the
5973                  * top page  but keep the paging reference on the object
5974                  * holding the dest page so it doesn't go away.
5975                  */
5976
5977                 vm_page_lockspin_queues();
5978                 vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE);
5979                 vm_page_unlock_queues();
5980                 PAGE_WAKEUP_DONE(dst_page);
5981                 vm_object_unlock(dst_object);
5982
5983                 if (dst_top_page != VM_PAGE_NULL) {
5984                         vm_object_lock(dst_object);
5985                         VM_PAGE_FREE(dst_top_page);
5986                         vm_object_paging_end(dst_object);
5987                         vm_object_unlock(dst_object);
5988                 }
5989
5990 RetrySourceFault:;
5991
5992                 if (src_object == VM_OBJECT_NULL) {
5993                         /*
5994                          *      No source object.  We will just
5995                          *      zero-fill the page in dst_object.
5996                          */
5997                         src_page = VM_PAGE_NULL;
5998                         result_page = VM_PAGE_NULL;
5999                 } else {
6000                         vm_object_lock(src_object);
6001                         src_page = vm_page_lookup(src_object,
6002                             vm_object_trunc_page(src_offset));
6003                         if (src_page == dst_page) {
6004                                 src_prot = dst_prot;
6005                                 result_page = VM_PAGE_NULL;
                                     /*
                                      * vm_fault_page() is not called on this
                                      * path, so the only lock held is the one
                                      * just taken on src_object.  Point the
                                      * common unlock below at it; otherwise
                                      * it would be handed NULL or a stale
                                      * object from an earlier iteration and
                                      * src_object would stay locked.
                                      */
                                     result_page_object = src_object;
6006                         } else {
6007                                 src_prot = VM_PROT_READ;
6008                                 vm_object_paging_begin(src_object);
6009
6010                                 /* cap cluster size at maximum UPL size */
6011                                 if (os_convert_overflow(amount_left, &cluster_size)) {
6012                                         cluster_size = 0 - (upl_size_t)PAGE_SIZE;
6013                                 }
6014                                 fault_info_src.cluster_size = cluster_size;
6015
6016                                 result_page = VM_PAGE_NULL;
6017                                 result = vm_fault_page(
6018                                         src_object,
6019                                         vm_object_trunc_page(src_offset),
6020                                         VM_PROT_READ, FALSE,
6021                                         FALSE, /* page not looked up */
6022                                         &src_prot,
6023                                         &result_page, &src_top_page,
6024                                         (int *)0, &error, FALSE,
6025                                         FALSE, &fault_info_src);
6026
6027                                 switch (result) {
6028                                 case VM_FAULT_SUCCESS:
6029                                         break;
6030                                 case VM_FAULT_RETRY:
6031                                         goto RetrySourceFault;
6032                                 case VM_FAULT_MEMORY_SHORTAGE:
6033                                         if (vm_page_wait(interruptible)) {
6034                                                 goto RetrySourceFault;
6035                                         }
6036                                 /* fall thru */
6037                                 case VM_FAULT_INTERRUPTED:
6038                                         vm_fault_copy_dst_cleanup(dst_page);
6039                                         RETURN(MACH_SEND_INTERRUPTED);
6040                                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
6041                                         /* success but no VM page: fail */
6042                                         vm_object_paging_end(src_object);
6043                                         vm_object_unlock(src_object);
6044                                 /*FALLTHROUGH*/
6045                                 case VM_FAULT_MEMORY_ERROR:
6046                                         vm_fault_copy_dst_cleanup(dst_page);
6047                                         if (error) {
6048                                                 return error;
6049                                         } else {
6050                                                 return KERN_MEMORY_ERROR;
6051                                         }
6052                                 default:
6053                                         panic("vm_fault_copy(2): unexpected "
6054                                             "error 0x%x from "
6055                                             "vm_fault_page()\n", result);
6056                                 }
6057
6058                                 result_page_object = VM_PAGE_OBJECT(result_page);
6059                                 assert((src_top_page == VM_PAGE_NULL) ==
6060                                     (result_page_object == src_object));
6061                         }
6062                         assert((src_prot & VM_PROT_READ) != VM_PROT_NONE);
6063                         vm_object_unlock(result_page_object);
6064                 }
6065
6066                 vm_map_lock_read(dst_map);
6067
                     /*
                      * If the destination map changed underneath us, stop
                      * here; the loop exit reports what was copied so far.
                      */
6068                 if (!vm_map_verify(dst_map, dst_version)) {
6069                         vm_map_unlock_read(dst_map);
6070                         if (result_page != VM_PAGE_NULL && src_page != dst_page) {
6071                                 vm_fault_copy_cleanup(result_page, src_top_page);
6072                         }
6073                         vm_fault_copy_dst_cleanup(dst_page);
6074                         break;
6075                 }
6076                 assert(dst_object == VM_PAGE_OBJECT(dst_page));
6077
6078                 vm_object_lock(dst_object);
6079
                     /* likewise stop if dst_object's copy object changed */
6080                 if (dst_object->copy != old_copy_object) {
6081                         vm_object_unlock(dst_object);
6082                         vm_map_unlock_read(dst_map);
6083                         if (result_page != VM_PAGE_NULL && src_page != dst_page) {
6084                                 vm_fault_copy_cleanup(result_page, src_top_page);
6085                         }
6086                         vm_fault_copy_dst_cleanup(dst_page);
6087                         break;
6088                 }
6089                 vm_object_unlock(dst_object);
6090
6091                 /*
6092                  *      Copy the page, and note that it is dirty
6093                  *      immediately.
6094                  */
6095
6096                 if (!page_aligned(src_offset) ||
6097                     !page_aligned(dst_offset) ||
6098                     !page_aligned(amount_left)) {
                             /*
                              * Partial page: copy up to the nearer of the two
                              * page ends, capped at amount_left.
                              */
6099                         vm_object_offset_t      src_po,
6100                             dst_po;
6101
6102                         src_po = src_offset - vm_object_trunc_page(src_offset);
6103                         dst_po = dst_offset - vm_object_trunc_page(dst_offset);
6104
6105                         if (dst_po > src_po) {
6106                                 part_size = PAGE_SIZE - dst_po;
6107                         } else {
6108                                 part_size = PAGE_SIZE - src_po;
6109                         }
6110                         if (part_size > (amount_left)) {
6111                                 part_size = amount_left;
6112                         }
6113
6114                         if (result_page == VM_PAGE_NULL) {
6115                                 assert((vm_offset_t) dst_po == dst_po);
6116                                 assert((vm_size_t) part_size == part_size);
6117                                 vm_page_part_zero_fill(dst_page,
6118                                     (vm_offset_t) dst_po,
6119                                     (vm_size_t) part_size);
6120                         } else {
6121                                 assert((vm_offset_t) src_po == src_po);
6122                                 assert((vm_offset_t) dst_po == dst_po);
6123                                 assert((vm_size_t) part_size == part_size);
6124                                 vm_page_part_copy(result_page,
6125                                     (vm_offset_t) src_po,
6126                                     dst_page,
6127                                     (vm_offset_t) dst_po,
6128                                     (vm_size_t)part_size);
6129                                 if (!dst_page->vmp_dirty) {
6130                                         vm_object_lock(dst_object);
6131                                         SET_PAGE_DIRTY(dst_page, TRUE);
6132                                         vm_object_unlock(dst_object);
6133                                 }
6134                         }
6135                 } else {
6136                         part_size = PAGE_SIZE;
6137
6138                         if (result_page == VM_PAGE_NULL) {
6139                                 vm_page_zero_fill(dst_page);
6140                         } else {
6141                                 vm_object_lock(result_page_object);
6142                                 vm_page_copy(result_page, dst_page);
6143                                 vm_object_unlock(result_page_object);
6144
6145                                 if (!dst_page->vmp_dirty) {
6146                                         vm_object_lock(dst_object);
6147                                         SET_PAGE_DIRTY(dst_page, TRUE);
6148                                         vm_object_unlock(dst_object);
6149                                 }
6150                         }
6151                 }
6152
6153                 /*
6154                  *      Unlock everything, and return
6155                  */
6156
6157                 vm_map_unlock_read(dst_map);
6158
6159                 if (result_page != VM_PAGE_NULL && src_page != dst_page) {
6160                         vm_fault_copy_cleanup(result_page, src_top_page);
6161                 }
6162                 vm_fault_copy_dst_cleanup(dst_page);
6163
6164                 amount_left -= part_size;
6165                 src_offset += part_size;
6166                 dst_offset += part_size;
6167         } while (amount_left > 0);
6168
6169         RETURN(KERN_SUCCESS);
6170 #undef  RETURN
6171
6172         /*NOTREACHED*/
6173 }
6174
6175 #if     VM_FAULT_CLASSIFY
6176 /*
6177  *      Temporary statistics gathering support.
6178  */
6179
6180 /*
6181  *      Statistics arrays:
      *
      *      vm_fault_stats[type][level] is a table of counters.  "type" is
      *      one of the VM_FAULT_TYPE_* values below (VM_FAULT_TYPES_MAX of
      *      them) and "level" is the number of shadow-object hops that
      *      vm_fault_classify() followed before it could classify the
      *      fault.  Valid "level" indices are 0 .. VM_FAULT_LEVEL_MAX - 1.
6182  */
6183 #define VM_FAULT_TYPES_MAX      5
6184 #define VM_FAULT_LEVEL_MAX      8
6185
6186 int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
6187
     /*
      *      Fault classifications (first index of vm_fault_stats):
      *      ZERO_FILL  no page, no pager and no shadow object to fall back on
      *      MAP_IN     page found; read fault, or write to a top-level
      *                 object that has no copy object
      *      PAGER      no page found, but the object has a pager
      *      COPY       page found; any other write fault
      *      OTHER      page found but busy, in error, restarting or absent
      */
6188 #define VM_FAULT_TYPE_ZERO_FILL 0
6189 #define VM_FAULT_TYPE_MAP_IN    1
6190 #define VM_FAULT_TYPE_PAGER     2
6191 #define VM_FAULT_TYPE_COPY      3
6192 #define VM_FAULT_TYPE_OTHER     4
6193
6194
     /*
      *      vm_fault_classify:
      *
      *      Classify a fault on (object, offset) for the statistics table and
      *      bump the matching vm_fault_stats[type][level] counter.
      *
      *      Starting at "object", look the page up; if it is missing and the
      *      object has neither a pager nor a page, follow the shadow chain
      *      (adjusting "offset" by vo_shadow_offset) and try again.  "level"
      *      counts the shadow hops taken.
      *
      *      fault_type is only inspected for VM_PROT_WRITE.
      *
      *      Chains deeper than the table is wide are all accounted in the
      *      last column (index VM_FAULT_LEVEL_MAX - 1).
      *
      *      No object locks are taken by this routine.
      */
6195 void
6196 vm_fault_classify(vm_object_t           object,
6197     vm_object_offset_t    offset,
6198     vm_prot_t             fault_type)
6199 {
6200         int             type, level = 0;
6201         vm_page_t       m;
6202
6203         while (TRUE) {
6204                 m = vm_page_lookup(object, offset);
6205                 if (m != VM_PAGE_NULL) {
6206                         if (m->vmp_busy || m->vmp_error || m->vmp_restart || m->vmp_absent) {
6207                                 type = VM_FAULT_TYPE_OTHER;
6208                                 break;
6209                         }
6210                         if (((fault_type & VM_PROT_WRITE) == 0) ||
6211                             ((level == 0) && object->copy == VM_OBJECT_NULL)) {
6212                                 type = VM_FAULT_TYPE_MAP_IN;
6213                                 break;
6214                         }
6215                         type = VM_FAULT_TYPE_COPY;
6216                         break;
6217                 } else {
6218                         if (object->pager_created) {
6219                                 type = VM_FAULT_TYPE_PAGER;
6220                                 break;
6221                         }
6222                         if (object->shadow == VM_OBJECT_NULL) {
6223                                 type = VM_FAULT_TYPE_ZERO_FILL;
6224                                 break;
6225                         }
6226
                             /* nothing here: descend to the shadow object */
6227                         offset += object->vo_shadow_offset;
6228                         object = object->shadow;
6229                         level++;
6230                         continue;
6231                 }
6232         }
6233
             /*
              * The second dimension of vm_fault_stats has VM_FAULT_LEVEL_MAX
              * entries, so the largest valid index is VM_FAULT_LEVEL_MAX - 1.
              */
6234         if (level >= VM_FAULT_LEVEL_MAX) {
6235                 level = VM_FAULT_LEVEL_MAX - 1;
6236         }
6237
6238         vm_fault_stats[type][level] += 1;
6239
6240         return;
6241 }
6242
6243 /*
      *      vm_fault_classify_init:
      *
      *      Reset every counter in vm_fault_stats to zero.
      *      Cleanup routine to call from debugger.
      */
6244
6245 void
6246 vm_fault_classify_init(void)
6247 {
6248         int slot;
6249
             /* treat the table as one flat run of TYPES_MAX * LEVEL_MAX counters */
6250         for (slot = 0; slot < VM_FAULT_TYPES_MAX * VM_FAULT_LEVEL_MAX; slot++) {
6252                 vm_fault_stats[slot / VM_FAULT_LEVEL_MAX][slot % VM_FAULT_LEVEL_MAX] = 0;
6254         }
6257 }
6258 #endif  /* VM_FAULT_CLASSIFY */
6259
6260 vm_offset_t
6261 kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr)
6262 {
6263         vm_map_entry_t  entry;
6264         vm_object_t     object;
6265         vm_offset_t     object_offset;
6266         vm_page_t       m;
6267         int             compressor_external_state, compressed_count_delta;
6268         int             compressor_flags = (C_DONT_BLOCK | C_KEEP | C_KDP);
6269         int             my_fault_type = VM_PROT_READ;
6270         kern_return_t   kr;
6271
6272         if (not_in_kdp) {
6273                 panic("kdp_lightweight_fault called from outside of debugger context");
6274         }
6275
6276         assert(map != VM_MAP_NULL);
6277
6278         assert((cur_target_addr & PAGE_MASK) == 0);
6279         if ((cur_target_addr & PAGE_MASK) != 0) {
6280                 return 0;
6281         }
6282
6283         if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) {
6284                 return 0;
6285         }
6286
6287         if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) {
6288                 return 0;
6289         }
6290
6291         if (entry->is_sub_map) {
6292                 return 0;
6293         }
6294
6295         object = VME_OBJECT(entry);
6296         if (object == VM_OBJECT_NULL) {
6297                 return 0;
6298         }
6299
6300         object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry);
6301
6302         while (TRUE) {
6303                 if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) {
6304                         return 0;
6305                 }
6306
6307                 if (object->pager_created && (object->paging_in_progress ||
6308                     object->activity_in_progress)) {
6309                         return 0;
6310                 }
6311
6312                 m = kdp_vm_page_lookup(object, object_offset);
6313
6314                 if (m != VM_PAGE_NULL) {
6315                         if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) {
6316                                 return 0;
6317                         }
6318
6319                         if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done || m->vmp_absent || m->vmp_error || m->vmp_cleaning ||
6320                             m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) {
6321                                 return 0;
6322                         }
6323
6324                         assert(!m->vmp_private);
6325                         if (m->vmp_private) {
6326                                 return 0;
6327                         }
6328
6329                         assert(!m->vmp_fictitious);
6330                         if (m->vmp_fictitious) {
6331                                 return 0;
6332                         }
6333
6334                         assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
6335                         if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
6336                                 return 0;
6337                         }
6338
6339                         return ptoa(VM_PAGE_GET_PHYS_PAGE(m));
6340                 }
6341
6342                 compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
6343
6344                 if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) {
6345                         if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) {
6346                                 kr = vm_compressor_pager_get(object->pager, (object_offset + object->paging_offset),
6347                                     kdp_compressor_decompressed_page_ppnum, &my_fault_type,
6348                                     compressor_flags, &compressed_count_delta);
6349                                 if (kr == KERN_SUCCESS) {
6350                                         return kdp_compressor_decompressed_page_paddr;
6351                                 } else {
6352                                         return 0;
6353                                 }
6354                         }
6355                 }
6356
6357                 if (object->shadow == VM_OBJECT_NULL) {
6358                         return 0;
6359                 }
6360
6361                 object_offset += object->vo_shadow_offset;
6362                 object = object->shadow;
6363         }
6364 }
6365
6366 /*
6367  * vm_page_validate_cs_fast():
6368  * Performs a few quick checks to determine if the page's code signature
6369  * really needs to be fully validated.  It could:
6370  *      1. have been modified (i.e. automatically tainted),
6371  *      2. have already been validated,
6372  *      3. have already been found to be tainted,
6373  *      4. no longer have a backing store.
6374  * Returns FALSE if the page needs to be fully validated.
6375  */
6376 static boolean_t
6377 vm_page_validate_cs_fast(
6378         vm_page_t       page)
6379 {
6380         vm_object_t     object;
6381
6382         object = VM_PAGE_OBJECT(page);
6383         vm_object_lock_assert_held(object);
6384
6385         if (page->vmp_wpmapped && !page->vmp_cs_tainted) {
6386                 /*
6387                  * This page was mapped for "write" access sometime in the
6388                  * past and could still be modifiable in the future.
6389                  * Consider it tainted.
6390                  * [ If the page was already found to be "tainted", no
6391                  * need to re-validate. ]
6392                  */
6393                 vm_object_lock_assert_exclusive(object);
6394                 page->vmp_cs_validated = TRUE;
6395                 page->vmp_cs_tainted = TRUE;
6396                 if (cs_debug) {
6397                         printf("CODESIGNING: %s: "
6398                             "page %p obj %p off 0x%llx "
6399                             "was modified\n",
6400                             __FUNCTION__,
6401                             page, object, page->vmp_offset);
6402                 }
6403                 vm_cs_validated_dirtied++;
6404         }
6405
6406         if (page->vmp_cs_validated || page->vmp_cs_tainted) {
6407                 return TRUE;
6408         }
6409         vm_object_lock_assert_exclusive(object);
6410
6411 #if CHECK_CS_VALIDATION_BITMAP
6412         kern_return_t kr;
6413
6414         kr = vnode_pager_cs_check_validation_bitmap(
6415                 object->pager,
6416                 page->vmp_offset + object->paging_offset,
6417                 CS_BITMAP_CHECK);
6418         if (kr == KERN_SUCCESS) {
6419                 page->vmp_cs_validated = TRUE;
6420                 page->vmp_cs_tainted = FALSE;
6421                 vm_cs_bitmap_validated++;
6422                 return TRUE;
6423         }
6424 #endif /* CHECK_CS_VALIDATION_BITMAP */
6425
6426         if (!object->alive || object->terminating || object->pager == NULL) {
6427                 /*
6428                  * The object is terminating and we don't have its pager
6429                  * so we can't validate the data...
6430                  */
6431                 return TRUE;
6432         }
6433
6434         /* we need to really validate this page */
6435         vm_object_lock_assert_exclusive(object);
6436         return FALSE;
6437 }
6438
6439 void
6440 vm_page_validate_cs_mapped_slow(
6441         vm_page_t       page,
6442         const void      *kaddr)
6443 {
6444         vm_object_t             object;
6445         memory_object_offset_t  mo_offset;
6446         memory_object_t         pager;
6447         struct vnode            *vnode;
6448         boolean_t               validated;
6449         unsigned                tainted;
6450
6451         assert(page->vmp_busy);
6452         object = VM_PAGE_OBJECT(page);
6453         vm_object_lock_assert_exclusive(object);
6454
6455         vm_cs_validates++;
6456
6457         /*
6458          * Since we get here to validate a page that was brought in by
6459          * the pager, we know that this pager is all setup and ready
6460          * by now.
6461          */
6462         assert(object->code_signed);
6463         assert(!object->internal);
6464         assert(object->pager != NULL);
6465         assert(object->pager_ready);
6466
6467         pager = object->pager;
6468         assert(object->paging_in_progress);
6469         vnode = vnode_pager_lookup_vnode(pager);
6470         mo_offset = page->vmp_offset + object->paging_offset;
6471
6472         /* verify the SHA1 hash for this page */
6473         tainted = 0;
6474         validated = cs_validate_range(vnode,
6475             pager,
6476             mo_offset,
6477             (const void *)((const char *)kaddr),
6478             PAGE_SIZE_64,
6479             &tainted);
6480
6481         if (tainted & CS_VALIDATE_TAINTED) {
6482                 page->vmp_cs_tainted = TRUE;
6483         }
6484         if (tainted & CS_VALIDATE_NX) {
6485                 page->vmp_cs_nx = TRUE;
6486         }
6487         if (validated) {
6488                 page->vmp_cs_validated = TRUE;
6489         }
6490
6491 #if CHECK_CS_VALIDATION_BITMAP
6492         if (page->vmp_cs_validated && !page->vmp_cs_tainted) {
6493                 vnode_pager_cs_check_validation_bitmap(object->pager,
6494                     mo_offset,
6495                     CS_BITMAP_SET);
6496         }
6497 #endif /* CHECK_CS_VALIDATION_BITMAP */
6498 }
6499
6500 void
6501 vm_page_validate_cs_mapped(
6502         vm_page_t       page,
6503         const void      *kaddr)
6504 {
6505         if (!vm_page_validate_cs_fast(page)) {
6506                 vm_page_validate_cs_mapped_slow(page, kaddr);
6507         }
6508 }
6509
6510 void
6511 vm_page_validate_cs(
6512         vm_page_t       page)
6513 {
6514         vm_object_t             object;
6515         vm_object_offset_t      offset;
6516         vm_map_offset_t         koffset;
6517         vm_map_size_t           ksize;
6518         vm_offset_t             kaddr;
6519         kern_return_t           kr;
6520         boolean_t               busy_page;
6521         boolean_t               need_unmap;
6522
6523         object = VM_PAGE_OBJECT(page);
6524         vm_object_lock_assert_held(object);
6525
6526         if (vm_page_validate_cs_fast(page)) {
6527                 return;
6528         }
6529         vm_object_lock_assert_exclusive(object);
6530
6531         assert(object->code_signed);
6532         offset = page->vmp_offset;
6533
6534         busy_page = page->vmp_busy;
6535         if (!busy_page) {
6536                 /* keep page busy while we map (and unlock) the VM object */
6537                 page->vmp_busy = TRUE;
6538         }
6539
6540         /*
6541          * Take a paging reference on the VM object
6542          * to protect it from collapse or bypass,
6543          * and keep it from disappearing too.
6544          */
6545         vm_object_paging_begin(object);
6546
6547         /* map the page in the kernel address space */
6548         ksize = PAGE_SIZE_64;
6549         koffset = 0;
6550         need_unmap = FALSE;
6551         kr = vm_paging_map_object(page,
6552             object,
6553             offset,
6554             VM_PROT_READ,
6555             FALSE,                       /* can't unlock object ! */
6556             &ksize,
6557             &koffset,
6558             &need_unmap);
6559         if (kr != KERN_SUCCESS) {
6560                 panic("%s: could not map page: 0x%x\n", __FUNCTION__, kr);
6561         }
6562         kaddr = CAST_DOWN(vm_offset_t, koffset);
6563
6564         /* validate the mapped page */
6565         vm_page_validate_cs_mapped_slow(page, (const void *) kaddr);
6566
6567         assert(page->vmp_busy);
6568         assert(object == VM_PAGE_OBJECT(page));
6569         vm_object_lock_assert_exclusive(object);
6570
6571         if (!busy_page) {
6572                 PAGE_WAKEUP_DONE(page);
6573         }
6574         if (need_unmap) {
6575                 /* unmap the map from the kernel address space */
6576                 vm_paging_unmap_object(object, koffset, koffset + ksize);
6577                 koffset = 0;
6578                 ksize = 0;
6579                 kaddr = 0;
6580         }
6581         vm_object_paging_end(object);
6582 }
6583
6584 void
6585 vm_page_validate_cs_mapped_chunk(
6586         vm_page_t       page,
6587         const void      *kaddr,
6588         vm_offset_t     chunk_offset,
6589         vm_size_t       chunk_size,
6590         boolean_t       *validated_p,
6591         unsigned        *tainted_p)
6592 {
6593         vm_object_t             object;
6594         vm_object_offset_t      offset, offset_in_page;
6595         memory_object_t         pager;
6596         struct vnode            *vnode;
6597         boolean_t               validated;
6598         unsigned                tainted;
6599
6600         *validated_p = FALSE;
6601         *tainted_p = 0;
6602
6603         assert(page->vmp_busy);
6604         object = VM_PAGE_OBJECT(page);
6605         vm_object_lock_assert_exclusive(object);
6606
6607         assert(object->code_signed);
6608         offset = page->vmp_offset;
6609
6610         if (!object->alive || object->terminating || object->pager == NULL) {
6611                 /*
6612                  * The object is terminating and we don't have its pager
6613                  * so we can't validate the data...
6614                  */
6615                 return;
6616         }
6617         /*
6618          * Since we get here to validate a page that was brought in by
6619          * the pager, we know that this pager is all setup and ready
6620          * by now.
6621          */
6622         assert(!object->internal);
6623         assert(object->pager != NULL);
6624         assert(object->pager_ready);
6625
6626         pager = object->pager;
6627         assert(object->paging_in_progress);
6628         vnode = vnode_pager_lookup_vnode(pager);
6629
6630         /* verify the signature for this chunk */
6631         offset_in_page = chunk_offset;
6632         assert(offset_in_page < PAGE_SIZE);
6633
6634         tainted = 0;
6635         validated = cs_validate_range(vnode,
6636             pager,
6637             (object->paging_offset +
6638             offset +
6639             offset_in_page),
6640             (const void *)((const char *)kaddr
6641             + offset_in_page),
6642             chunk_size,
6643             &tainted);
6644         if (validated) {
6645                 *validated_p = TRUE;
6646         }
6647         if (tainted) {
6648                 *tainted_p = tainted;
6649         }
6650 }
6651
6652 static void
6653 vm_rtfrecord_lock(void)
6654 {
6655         lck_spin_lock(&vm_rtfr_slock);
6656 }
6657
6658 static void
6659 vm_rtfrecord_unlock(void)
6660 {
6661         lck_spin_unlock(&vm_rtfr_slock);
6662 }
6663
6664 unsigned int
6665 vmrtfaultinfo_bufsz(void)
6666 {
6667         return vmrtf_num_records * sizeof(vm_rtfault_record_t);
6668 }
6669
6670 #include <kern/backtrace.h>
6671
6672 static void
6673 vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault)
6674 {
6675         uint64_t fend = mach_continuous_time();
6676
6677         uint64_t cfpc = 0;
6678         uint64_t ctid = cthread->thread_id;
6679         uint64_t cupid = get_current_unique_pid();
6680
6681         uintptr_t bpc = 0;
6682         uint32_t bfrs = 0;
6683         bool u64 = false;
6684
6685         /* Capture a single-frame backtrace; this extracts just the program
6686          * counter at the point of the fault into "bpc", and should perform no
6687          * further user stack traversals, thus avoiding copyin()s and further
6688          * faults.
6689          */
6690         int btr = backtrace_thread_user(cthread, &bpc, 1U, &bfrs, &u64, NULL);
6691
6692         if ((btr == 0) && (bfrs > 0)) {
6693                 cfpc = bpc;
6694         }
6695
6696         assert((fstart != 0) && fend >= fstart);
6697         vm_rtfrecord_lock();
6698         assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi);
6699
6700         vmrtfrs.vmrtf_total++;
6701         vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++];
6702
6703         cvmr->rtfabstime = fstart;
6704         cvmr->rtfduration = fend - fstart;
6705         cvmr->rtfaddr = fault_vaddr;
6706         cvmr->rtfpc = cfpc;
6707         cvmr->rtftype = type_of_fault;
6708         cvmr->rtfupid = cupid;
6709         cvmr->rtftid = ctid;
6710
6711         if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) {
6712                 vmrtfrs.vmrtfr_curi = 0;
6713         }
6714
6715         vm_rtfrecord_unlock();
6716 }
6717
6718 int
6719 vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, int vrecordsz, void *vrecords, int *vmrtfrv)
6720 {
6721         vm_rtfault_record_t *cvmrd = vrecords;
6722         size_t residue = vrecordsz;
6723         int numextracted = 0;
6724         boolean_t early_exit = FALSE;
6725
6726         vm_rtfrecord_lock();
6727
6728         for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) {
6729                 if (residue < sizeof(vm_rtfault_record_t)) {
6730                         early_exit = TRUE;
6731                         break;
6732                 }
6733
6734                 if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) {
6735 #if     DEVELOPMENT || DEBUG
6736                         if (isroot == FALSE) {
6737                                 continue;
6738                         }
6739 #else
6740                         continue;
6741 #endif /* DEVDEBUG */
6742                 }
6743
6744                 *cvmrd = vmrtfrs.vm_rtf_records[vmfi];
6745                 cvmrd++;
6746                 residue -= sizeof(vm_rtfault_record_t);
6747                 numextracted++;
6748         }
6749
6750         vm_rtfrecord_unlock();
6751
6752         *vmrtfrv = numextracted;
6753         return early_exit;
6754 }