osfmk/vm/vm_fault.c

   1 /*
   2  * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   vm_fault.c
  60  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  61  *
  62  *      Page fault handling module.
  63  */
  64
  65 #include <mach_cluster_stats.h>
  66 #include <mach_pagemap.h>
  67 #include <libkern/OSAtomic.h>
  68
  69 #include <mach/mach_types.h>
  70 #include <mach/kern_return.h>
  71 #include <mach/message.h>       /* for error codes */
  72 #include <mach/vm_param.h>
  73 #include <mach/vm_behavior.h>
  74 #include <mach/memory_object.h>
  75 /* For memory_object_data_{request,unlock} */
  76 #include <mach/sdt.h>
  77
  78 #include <kern/kern_types.h>
  79 #include <kern/host_statistics.h>
  80 #include <kern/counters.h>
  81 #include <kern/task.h>
  82 #include <kern/thread.h>
  83 #include <kern/sched_prim.h>
  84 #include <kern/host.h>
  85 #include <kern/mach_param.h>
  86 #include <kern/macro_help.h>
  87 #include <kern/zalloc.h>
  88 #include <kern/misc_protos.h>
  89 #include <kern/policy_internal.h>
  90
  91 #include <vm/vm_compressor.h>
  92 #include <vm/vm_compressor_pager.h>
  93 #include <vm/vm_fault.h>
  94 #include <vm/vm_map.h>
  95 #include <vm/vm_object.h>
  96 #include <vm/vm_page.h>
  97 #include <vm/vm_kern.h>
  98 #include <vm/pmap.h>
  99 #include <vm/vm_pageout.h>
 100 #include <vm/vm_protos.h>
 101 #include <vm/vm_external.h>
 102 #include <vm/memory_object.h>
 103 #include <vm/vm_purgeable_internal.h>   /* Needed by some vm_page.h macros */
 104 #include <vm/vm_shared_region.h>
 105
 106 #include <sys/codesign.h>
 107 #include <sys/reason.h>
 108 #include <sys/signalvar.h>
 109
 110 #include <san/kasan.h>
 111
 112 #define VM_FAULT_CLASSIFY       0
 113
 114 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
 115
 116 int vm_protect_privileged_from_untrusted = 1;
 117
 118 unsigned int    vm_object_pagein_throttle = 16;
 119
 120 /*
 121  * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
 122  * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
 123  * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
 124  * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
 125  * keep the UI active so that the user has a chance to kill the offending task before the system
 126  * completely hangs.
 127  *
 128  * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
 129  * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
 130  * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
 131  * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
 132  */
 133
 134 extern void throttle_lowpri_io(int);
 135
 136 extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);
 137
 138 uint64_t vm_hard_throttle_threshold;
 139
 140
 141
 142 #define NEED_TO_HARD_THROTTLE_THIS_TASK()       (vm_wants_task_throttled(current_task()) ||     \
 143                                                  ((vm_page_free_count < vm_page_throttle_limit || \
 144                                                    HARD_THROTTLE_LIMIT_REACHED()) && \
 145                                                   proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED))
 146
 147
 148 #define HARD_THROTTLE_DELAY     10000   /* 10000 us == 10 ms */
 149 #define SOFT_THROTTLE_DELAY     200     /* 200 us == .2 ms */
 150
 151 #define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS   6
 152 #define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC  20000
 153
 154
 155 #define VM_STAT_DECOMPRESSIONS()        \
 156 MACRO_BEGIN                             \
 157         VM_STAT_INCR(decompressions);       \
 158         current_thread()->decompressions++; \
 159 MACRO_END
 160
 161 boolean_t current_thread_aborted(void);
 162
 163 /* Forward declarations of internal routines. */
 164 static kern_return_t vm_fault_wire_fast(
 165         vm_map_t        map,
 166         vm_map_offset_t va,
 167         vm_prot_t       prot,
 168         vm_tag_t        wire_tag,
 169         vm_map_entry_t  entry,
 170         pmap_t          pmap,
 171         vm_map_offset_t pmap_addr,
 172         ppnum_t         *physpage_p);
 173
 174 static kern_return_t vm_fault_internal(
 175         vm_map_t        map,
 176         vm_map_offset_t vaddr,
 177         vm_prot_t       caller_prot,
 178         boolean_t       change_wiring,
 179         vm_tag_t        wire_tag,
 180         int             interruptible,
 181         pmap_t          pmap,
 182         vm_map_offset_t pmap_addr,
 183         ppnum_t         *physpage_p);
 184
 185 static void vm_fault_copy_cleanup(
 186         vm_page_t       page,
 187         vm_page_t       top_page);
 188
 189 static void vm_fault_copy_dst_cleanup(
 190         vm_page_t       page);
 191
 192 #if     VM_FAULT_CLASSIFY
 193 extern void vm_fault_classify(vm_object_t       object,
 194     vm_object_offset_t    offset,
 195     vm_prot_t             fault_type);
 196
 197 extern void vm_fault_classify_init(void);
 198 #endif
 199
 200 unsigned long vm_pmap_enter_blocked = 0;
 201 unsigned long vm_pmap_enter_retried = 0;
 202
 203 unsigned long vm_cs_validates = 0;
 204 unsigned long vm_cs_revalidates = 0;
 205 unsigned long vm_cs_query_modified = 0;
 206 unsigned long vm_cs_validated_dirtied = 0;
 207 unsigned long vm_cs_bitmap_validated = 0;
 208 #if PMAP_CS
 209 uint64_t vm_cs_defer_to_pmap_cs = 0;
 210 uint64_t vm_cs_defer_to_pmap_cs_not = 0;
 211 #endif /* PMAP_CS */
 212
 213 void vm_pre_fault(vm_map_offset_t, vm_prot_t);
 214
 215 extern char *kdp_compressor_decompressed_page;
 216 extern addr64_t kdp_compressor_decompressed_page_paddr;
 217 extern ppnum_t  kdp_compressor_decompressed_page_ppnum;
 218
 219 struct vmrtfr {
 220         int vmrtfr_maxi;
 221         int vmrtfr_curi;
 222         int64_t vmrtf_total;
 223         vm_rtfault_record_t *vm_rtf_records;
 224 } vmrtfrs;
 225 #define VMRTF_DEFAULT_BUFSIZE (4096)
 226 #define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t))
 227 int vmrtf_num_records = VMRTF_NUM_RECORDS_DEFAULT;
 228
 229 static void vm_rtfrecord_lock(void);
 230 static void vm_rtfrecord_unlock(void);
 231 static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int);
 232
 233 lck_spin_t vm_rtfr_slock;
 234 extern lck_grp_t vm_page_lck_grp_bucket;
 235 extern lck_attr_t vm_page_lck_attr;
 236
 237 /*
 238  *      Routine:        vm_fault_init
 239  *      Purpose:
 240  *              Initialize our private data structures.
 241  */
 242 void
 243 vm_fault_init(void)
 244 {
 245         int i, vm_compressor_temp;
 246         boolean_t need_default_val = TRUE;
 247         /*
 248          * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
 249          * computed as a percentage of available memory, and the percentage used is scaled inversely with
 250          * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
 251          * and reduce the value down to 10% for very large memory configurations.  This helps give us a
 252          * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
 253          * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
 254          */
 255
 256         vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024 * 1024 * 1024)), 25)) / 100;
 257
 258         /*
 259          * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
 260          */
 261
 262         if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof(vm_compressor_temp))) {
 263                 for (i = 0; i < VM_PAGER_MAX_MODES; i++) {
 264                         if (vm_compressor_temp > 0 &&
 265                             ((vm_compressor_temp & (1 << i)) == vm_compressor_temp)) {
 266                                 need_default_val = FALSE;
 267                                 vm_compressor_mode = vm_compressor_temp;
 268                                 break;
 269                         }
 270                 }
 271                 if (need_default_val) {
 272                         printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
 273                 }
 274         }
 275         if (need_default_val) {
 276                 /* If no boot arg or incorrect boot arg, try device tree. */
 277                 PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
 278         }
 279         printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
 280
 281         PE_parse_boot_argn("vm_protect_privileged_from_untrusted", &vm_protect_privileged_from_untrusted, sizeof(vm_protect_privileged_from_untrusted));
 282 }
 283
 284 void
 285 vm_rtfault_record_init(void)
 286 {
 287         PE_parse_boot_argn("vm_rtfault_records", &vmrtf_num_records, sizeof(vmrtf_num_records));
 288
 289         assert(vmrtf_num_records >= 1);
 290         vmrtf_num_records = MAX(vmrtf_num_records, 1);
 291         size_t kallocsz = vmrtf_num_records * sizeof(vm_rtfault_record_t);
 292         vmrtfrs.vm_rtf_records = kalloc(kallocsz);
 293         bzero(vmrtfrs.vm_rtf_records, kallocsz);
 294         vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1;
 295         lck_spin_init(&vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
 296 }
 297 /*
 298  *      Routine:        vm_fault_cleanup
 299  *      Purpose:
 300  *              Clean up the result of vm_fault_page.
 301  *      Results:
 302  *              The paging reference for "object" is released.
 303  *              "object" is unlocked.
 304  *              If "top_page" is not null,  "top_page" is
 305  *              freed and the paging reference for the object
 306  *              containing it is released.
 307  *
 308  *      In/out conditions:
 309  *              "object" must be locked.
 310  */
 311 void
 312 vm_fault_cleanup(
 313         vm_object_t     object,
 314         vm_page_t       top_page)
 315 {
 316         vm_object_paging_end(object);
 317         vm_object_unlock(object);
 318
 319         if (top_page != VM_PAGE_NULL) {
 320                 object = VM_PAGE_OBJECT(top_page);
 321
 322                 vm_object_lock(object);
 323                 VM_PAGE_FREE(top_page);
 324                 vm_object_paging_end(object);
 325                 vm_object_unlock(object);
 326         }
 327 }
 328
 329 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
 330
 331
 332 boolean_t       vm_page_deactivate_behind = TRUE;
 333 /*
 334  * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 335  */
 336 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW     128
 337 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER    16              /* don't make this too big... */
 338                                                                 /* we use it to size an array on the stack */
 339
 340 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
 341
 342 #define MAX_SEQUENTIAL_RUN      (1024 * 1024 * 1024)
 343
 344 /*
 345  * vm_page_is_sequential
 346  *
 347  * Determine if sequential access is in progress
 348  * in accordance with the behavior specified.
 349  * Update state to indicate current access pattern.
 350  *
 351  * object must have at least the shared lock held
 352  */
 353 static
 354 void
 355 vm_fault_is_sequential(
 356         vm_object_t             object,
 357         vm_object_offset_t      offset,
 358         vm_behavior_t           behavior)
 359 {
 360         vm_object_offset_t      last_alloc;
 361         int                     sequential;
 362         int                     orig_sequential;
 363
 364         last_alloc = object->last_alloc;
 365         sequential = object->sequential;
 366         orig_sequential = sequential;
 367
 368         switch (behavior) {
 369         case VM_BEHAVIOR_RANDOM:
 370                 /*
 371                  * reset indicator of sequential behavior
 372                  */
 373                 sequential = 0;
 374                 break;
 375
 376         case VM_BEHAVIOR_SEQUENTIAL:
 377                 if (offset && last_alloc == offset - PAGE_SIZE_64) {
 378                         /*
 379                          * advance indicator of sequential behavior
 380                          */
 381                         if (sequential < MAX_SEQUENTIAL_RUN) {
 382                                 sequential += PAGE_SIZE;
 383                         }
 384                 } else {
 385                         /*
 386                          * reset indicator of sequential behavior
 387                          */
 388                         sequential = 0;
 389                 }
 390                 break;
 391
 392         case VM_BEHAVIOR_RSEQNTL:
 393                 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
 394                         /*
 395                          * advance indicator of sequential behavior
 396                          */
 397                         if (sequential > -MAX_SEQUENTIAL_RUN) {
 398                                 sequential -= PAGE_SIZE;
 399                         }
 400                 } else {
 401                         /*
 402                          * reset indicator of sequential behavior
 403                          */
 404                         sequential = 0;
 405                 }
 406                 break;
 407
 408         case VM_BEHAVIOR_DEFAULT:
 409         default:
 410                 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
 411                         /*
 412                          * advance indicator of sequential behavior
 413                          */
 414                         if (sequential < 0) {
 415                                 sequential = 0;
 416                         }
 417                         if (sequential < MAX_SEQUENTIAL_RUN) {
 418                                 sequential += PAGE_SIZE;
 419                         }
 420                 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
 421                         /*
 422                          * advance indicator of sequential behavior
 423                          */
 424                         if (sequential > 0) {
 425                                 sequential = 0;
 426                         }
 427                         if (sequential > -MAX_SEQUENTIAL_RUN) {
 428                                 sequential -= PAGE_SIZE;
 429                         }
 430                 } else {
 431                         /*
 432                          * reset indicator of sequential behavior
 433                          */
 434                         sequential = 0;
 435                 }
 436                 break;
 437         }
 438         if (sequential != orig_sequential) {
 439                 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
 440                         /*
 441                          * if someone else has already updated object->sequential
 442                          * don't bother trying to update it or object->last_alloc
 443                          */
 444                         return;
 445                 }
 446         }
 447         /*
 448          * I'd like to do this with a OSCompareAndSwap64, but that
 449          * doesn't exist for PPC...  however, it shouldn't matter
 450          * that much... last_alloc is maintained so that we can determine
 451          * if a sequential access pattern is taking place... if only
 452          * one thread is banging on this object, no problem with the unprotected
 453          * update... if 2 or more threads are banging away, we run the risk of
 454          * someone seeing a mangled update... however, in the face of multiple
 455          * accesses, no sequential access pattern can develop anyway, so we
 456          * haven't lost any real info.
 457          */
 458         object->last_alloc = offset;
 459 }
 460
 461
 462 int vm_page_deactivate_behind_count = 0;
 463
 464 /*
 465  * vm_page_deactivate_behind
 466  *
 467  * Determine if sequential access is in progress
 468  * in accordance with the behavior specified.  If
 469  * so, compute a potential page to deactivate and
 470  * deactivate it.
 471  *
 472  * object must be locked.
 473  *
 474  * return TRUE if we actually deactivate a page
 475  */
 476 static
 477 boolean_t
 478 vm_fault_deactivate_behind(
 479         vm_object_t             object,
 480         vm_object_offset_t      offset,
 481         vm_behavior_t           behavior)
 482 {
 483         int             n;
 484         int             pages_in_run = 0;
 485         int             max_pages_in_run = 0;
 486         int             sequential_run;
 487         int             sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 488         vm_object_offset_t      run_offset = 0;
 489         vm_object_offset_t      pg_offset = 0;
 490         vm_page_t       m;
 491         vm_page_t       page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
 492
 493         pages_in_run = 0;
 494 #if TRACEFAULTPAGE
 495         dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
 496 #endif
 497
 498         if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
 499                 /*
 500                  * Do not deactivate pages from the kernel object: they
 501                  * are not intended to become pageable.
 502                  * or we've disabled the deactivate behind mechanism
 503                  */
 504                 return FALSE;
 505         }
 506         if ((sequential_run = object->sequential)) {
 507                 if (sequential_run < 0) {
 508                         sequential_behavior = VM_BEHAVIOR_RSEQNTL;
 509                         sequential_run = 0 - sequential_run;
 510                 } else {
 511                         sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 512                 }
 513         }
 514         switch (behavior) {
 515         case VM_BEHAVIOR_RANDOM:
 516                 break;
 517         case VM_BEHAVIOR_SEQUENTIAL:
 518                 if (sequential_run >= (int)PAGE_SIZE) {
 519                         run_offset = 0 - PAGE_SIZE_64;
 520                         max_pages_in_run = 1;
 521                 }
 522                 break;
 523         case VM_BEHAVIOR_RSEQNTL:
 524                 if (sequential_run >= (int)PAGE_SIZE) {
 525                         run_offset = PAGE_SIZE_64;
 526                         max_pages_in_run = 1;
 527                 }
 528                 break;
 529         case VM_BEHAVIOR_DEFAULT:
 530         default:
 531         {       vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
 532
 533                 /*
 534                  * determine if the run of sequential accesss has been
 535                  * long enough on an object with default access behavior
 536                  * to consider it for deactivation
 537                  */
 538                 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
 539                         /*
 540                          * the comparisons between offset and behind are done
 541                          * in this kind of odd fashion in order to prevent wrap around
 542                          * at the end points
 543                          */
 544                         if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
 545                                 if (offset >= behind) {
 546                                         run_offset = 0 - behind;
 547                                         pg_offset = PAGE_SIZE_64;
 548                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 549                                 }
 550                         } else {
 551                                 if (offset < -behind) {
 552                                         run_offset = behind;
 553                                         pg_offset = 0 - PAGE_SIZE_64;
 554                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 555                                 }
 556                         }
 557                 }
 558                 break;}
 559         }
 560         for (n = 0; n < max_pages_in_run; n++) {
 561                 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
 562
 563                 if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->vmp_fictitious && !m->vmp_absent) {
 564                         page_run[pages_in_run++] = m;
 565
 566                         /*
 567                          * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
 568                          *
 569                          * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
 570                          * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
 571                          * new reference happens. If no futher references happen on the page after that remote TLB flushes
 572                          * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
 573                          * by pageout_scan, which is just fine since the last reference would have happened quite far
 574                          * in the past (TLB caches don't hang around for very long), and of course could just as easily
 575                          * have happened before we did the deactivate_behind.
 576                          */
 577                         pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
 578                 }
 579         }
 580         if (pages_in_run) {
 581                 vm_page_lockspin_queues();
 582
 583                 for (n = 0; n < pages_in_run; n++) {
 584                         m = page_run[n];
 585
 586                         vm_page_deactivate_internal(m, FALSE);
 587
 588                         vm_page_deactivate_behind_count++;
 589 #if TRACEFAULTPAGE
 590                         dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
 591 #endif
 592                 }
 593                 vm_page_unlock_queues();
 594
 595                 return TRUE;
 596         }
 597         return FALSE;
 598 }
 599
 600
 601 #if (DEVELOPMENT || DEBUG)
 602 uint32_t        vm_page_creation_throttled_hard = 0;
 603 uint32_t        vm_page_creation_throttled_soft = 0;
 604 uint64_t        vm_page_creation_throttle_avoided = 0;
 605 #endif /* DEVELOPMENT || DEBUG */
 606
 607 static int
 608 vm_page_throttled(boolean_t page_kept)
 609 {
 610         clock_sec_t     elapsed_sec;
 611         clock_sec_t     tv_sec;
 612         clock_usec_t    tv_usec;
 613
 614         thread_t thread = current_thread();
 615
 616         if (thread->options & TH_OPT_VMPRIV) {
 617                 return 0;
 618         }
 619
 620         if (thread->t_page_creation_throttled) {
 621                 thread->t_page_creation_throttled = 0;
 622
 623                 if (page_kept == FALSE) {
 624                         goto no_throttle;
 625                 }
 626         }
 627         if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
 628 #if (DEVELOPMENT || DEBUG)
 629                 thread->t_page_creation_throttled_hard++;
 630                 OSAddAtomic(1, &vm_page_creation_throttled_hard);
 631 #endif /* DEVELOPMENT || DEBUG */
 632                 return HARD_THROTTLE_DELAY;
 633         }
 634
 635         if ((vm_page_free_count < vm_page_throttle_limit || (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
 636             thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
 637                 if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) {
 638 #if (DEVELOPMENT || DEBUG)
 639                         OSAddAtomic64(1, &vm_page_creation_throttle_avoided);
 640 #endif
 641                         goto no_throttle;
 642                 }
 643                 clock_get_system_microtime(&tv_sec, &tv_usec);
 644
 645                 elapsed_sec = tv_sec - thread->t_page_creation_time;
 646
 647                 if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
 648                     (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
 649                         if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
 650                                 /*
 651                                  * we'll reset our stats to give a well behaved app
 652                                  * that was unlucky enough to accumulate a bunch of pages
 653                                  * over a long period of time a chance to get out of
 654                                  * the throttled state... we reset the counter and timestamp
 655                                  * so that if it stays under the rate limit for the next second
 656                                  * it will be back in our good graces... if it exceeds it, it
 657                                  * will remain in the throttled state
 658                                  */
 659                                 thread->t_page_creation_time = tv_sec;
 660                                 thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
 661                         }
 662                         VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1);
 663
 664                         thread->t_page_creation_throttled = 1;
 665
 666                         if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) {
 667 #if (DEVELOPMENT || DEBUG)
 668                                 thread->t_page_creation_throttled_hard++;
 669                                 OSAddAtomic(1, &vm_page_creation_throttled_hard);
 670 #endif /* DEVELOPMENT || DEBUG */
 671                                 return HARD_THROTTLE_DELAY;
 672                         } else {
 673 #if (DEVELOPMENT || DEBUG)
 674                                 thread->t_page_creation_throttled_soft++;
 675                                 OSAddAtomic(1, &vm_page_creation_throttled_soft);
 676 #endif /* DEVELOPMENT || DEBUG */
 677                                 return SOFT_THROTTLE_DELAY;
 678                         }
 679                 }
 680                 thread->t_page_creation_time = tv_sec;
 681                 thread->t_page_creation_count = 0;
 682         }
 683 no_throttle:
 684         thread->t_page_creation_count++;
 685
 686         return 0;
 687 }
 688
 689
 690 /*
 691  * check for various conditions that would
 692  * prevent us from creating a ZF (zero-fill) page...
 693  * cleanup is based on being called from vm_fault_page
 694  *
 695  * object must be locked
 696  * object == m->vmp_object (m may also be VM_PAGE_NULL)
      *
      * Parameters:
      *   object              - object being faulted on, locked by the caller
      *   m                   - page to free on failure; VM_PAGE_NULL is accepted
      *   first_m             - passed through to vm_fault_cleanup() on failure
      *   interruptible_state - value handed to thread_interrupt_level() before
      *                         an error is returned
      *   page_throttle       - when TRUE, vm_page_throttled() is consulted too
      *
      * Returns:
      *   VM_FAULT_SUCCESS         - nothing prevents the zero-fill; no cleanup
      *                              of any kind has been done
      *   VM_FAULT_MEMORY_ERROR    - object->shadow_severed is set or
      *                              VM_OBJECT_PURGEABLE_FAULT_ERROR(object) is true
      *   VM_FAULT_MEMORY_SHORTAGE - the thread was throttled and has been delayed
      *   VM_FAULT_INTERRUPTED     - as above, and current_thread_aborted() was
      *                              true once the delay had elapsed
      *
      * On every non-success return, m (if any) has been freed,
      * vm_fault_cleanup(object, first_m) has been called and
      * thread_interrupt_level(interruptible_state) has been applied.
      * NOTE(review): vm_fault_cleanup() is defined outside this excerpt;
      * it presumably drops the object lock and paging reference -- confirm.
 697  */
 698 static vm_fault_return_t
 699 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle)
 700 {
 701         int throttle_delay;
 702
 703         if (object->shadow_severed ||
 704             VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
 705                 /*
 706                  * Either:
 707                  * 1. the shadow chain was severed,
 708                  * 2. the purgeable object is volatile or empty and is marked
 709                  *    to fault on access while volatile.
 710                  * Just have to return an error at this point
 711                  */
 712                 if (m != VM_PAGE_NULL) {
 713                         VM_PAGE_FREE(m);
 714                 }
 715                 vm_fault_cleanup(object, first_m);
 716
 717                 thread_interrupt_level(interruptible_state);
 718
 719                 return VM_FAULT_MEMORY_ERROR;
 720         }
             /*
              * Optional throttling step: only performed when the caller
              * asked for it via page_throttle.
              */
 721         if (page_throttle == TRUE) {
 722                 if ((throttle_delay = vm_page_throttled(FALSE))) {
 723                         /*
 724                          * we're throttling zero-fills...
 725                          * treat this as if we couldn't grab a page
                              *
                              * vm_page_throttled() returned a non-zero delay:
                              * release everything first, then stall this thread
                              * by passing that value to delay() before
                              * reporting back.
 726                          */
 727                         if (m != VM_PAGE_NULL) {
 728                                 VM_PAGE_FREE(m);
 729                         }
 730                         vm_fault_cleanup(object, first_m);
 731
 732                         VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
 733
 734                         delay(throttle_delay);
 735
                             /*
                              * an abort observed after the delay is reported
                              * in preference to the memory shortage
                              */
 736                         if (current_thread_aborted()) {
 737                                 thread_interrupt_level(interruptible_state);
 738                                 return VM_FAULT_INTERRUPTED;
 739                         }
 740                         thread_interrupt_level(interruptible_state);
 741
 742                         return VM_FAULT_MEMORY_SHORTAGE;
 743                 }
 744         }
             /*
              * no error condition: nothing has been freed or cleaned up
              */
 745         return VM_FAULT_SUCCESS;
 746 }
 747
 748
 749 /*
 750  * do the work to zero fill a page and
 751  * inject it into the correct paging queue
 752  *
 753  * m->vmp_object must be locked
 754  * page queue lock must NOT be held
      *
      * Parameters:
      *   m            - page to prepare
      *   no_zero_fill - when TRUE the page contents are left as they are
      *                  (vm_page_zero_fill() is skipped)
      *
      * Returns the fault type for the caller to report:
      *   DBG_ZERO_FILL_FAULT - the page was zero-filled
      *   DBG_NZF_PAGE_FAULT  - no_zero_fill was TRUE
      *
      * The only queue change made here is moving the page to
      * vm_page_queue_throttled, and only when VM_DYNAMIC_PAGING_ENABLED()
      * is false and the object's purgable state is DENY, NONVOLATILE or
      * VOLATILE.  The page queues lock is taken and released internally
      * for that step.
 755  */
 756 static int
 757 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
 758 {
 759         int my_fault = DBG_ZERO_FILL_FAULT;
 760         vm_object_t     object;
 761
 762         object = VM_PAGE_OBJECT(m);
 763
 764         /*
 765          * This is a zero-fill page fault...
 766          *
 767          * Checking the page lock is a waste of
 768          * time;  this page was absent, so
 769          * it can't be page locked by a pager.
 770          *
 771          * we also consider it undefined
 772          * with respect to instruction
 773          * execution.  i.e. it is the responsibility
 774          * of higher layers to call for an instruction
 775          * sync after changing the contents and before
 776          * sending a program into this area.  We
 777          * choose this approach for performance
 778          */
 779         m->vmp_pmapped = TRUE;
 780
             /*
              * clear vmp_cs_validated, vmp_cs_tainted and vmp_cs_nx
              * NOTE(review): presumably the code-signing state of the
              * page -- confirm against the vm_page definition
              */
 781         m->vmp_cs_validated = FALSE;
 782         m->vmp_cs_tainted = FALSE;
 783         m->vmp_cs_nx = FALSE;
 784
 785         if (no_zero_fill == TRUE) {
 786                 my_fault = DBG_NZF_PAGE_FAULT;
 787
                     /*
                      * a page that is still marked both absent and busy
                      * is handed back as is: no zeroing and no queue
                      * change.  vm_fault_page sets vmp_absent on the page
                      * right before calling us when fault_info->mark_zf_absent
                      * and no_zero_fill are both set.
                      */
 788                 if (m->vmp_absent && m->vmp_busy) {
 789                         return my_fault;
 790                 }
 791         } else {
 792                 vm_page_zero_fill(m);
 793
                     /* count the zero-fill and fire the zfod DTrace probe */
 794                 VM_STAT_INCR(zero_fill_count);
 795                 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
 796         }
 797         assert(!m->vmp_laundry);
 798         assert(object != kernel_object);
 799         //assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
 800
             /*
              * With dynamic paging disabled, pages of objects in one of
              * the listed purgable states are parked on the throttled
              * queue.  The setting is tested again once the page queues
              * lock is held.
              * NOTE(review): the re-test presumably guards against the
              * setting changing before the lock was taken -- confirm.
              */
 801         if (!VM_DYNAMIC_PAGING_ENABLED() &&
 802             (object->purgable == VM_PURGABLE_DENY ||
 803             object->purgable == VM_PURGABLE_NONVOLATILE ||
 804             object->purgable == VM_PURGABLE_VOLATILE)) {
 805                 vm_page_lockspin_queues();
 806
 807                 if (!VM_DYNAMIC_PAGING_ENABLED()) {
 808                         assert(!VM_PAGE_WIRED(m));
 809
 810                         /*
 811                          * can't be on the pageout queue since we don't
 812                          * have a pager to try and clean to
 813                          */
 814                         vm_page_queues_remove(m, TRUE);
 815                         vm_page_check_pageable_safe(m);
 816                         vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
 817                         m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
 818                         vm_page_throttled_count++;
 819                 }
 820                 vm_page_unlock_queues();
 821         }
 822         return my_fault;
 823 }
 824
 825
 826 /*
 827  *      Routine:        vm_fault_page
 828  *      Purpose:
 829  *              Find the resident page for the virtual memory
 830  *              specified by the given virtual memory object
 831  *              and offset.
 832  *      Additional arguments:
 833  *              The required permissions for the page are given
 834  *              in "fault_type".  Desired permissions are included
 835  *              in "protection".
 836  *              fault_info is passed along to determine pagein cluster
 837  *              limits... it contains the expected reference pattern,
 838  *              cluster size if available, etc...
 839  *
 840  *              If the desired page is known to be resident (for
 841  *              example, because it was previously wired down), asserting
 842  *              the "unwiring" parameter will speed the search.
 843  *
 844  *              If the operation can be interrupted (by thread_abort
 845  *              or thread_terminate), then the "interruptible"
 846  *              parameter should be asserted.
 847  *
 848  *      Results:
 849  *              The page containing the proper data is returned
 850  *              in "result_page".
 851  *
 852  *      In/out conditions:
 853  *              The source object must be locked and referenced,
 854  *              and must donate one paging reference.  The reference
 855  *              is not affected.  The paging reference and lock are
 856  *              consumed.
 857  *
 858  *              If the call succeeds, the object in which "result_page"
 859  *              resides is left locked and holding a paging reference.
 860  *              If this is not the original object, a busy page in the
 861  *              original object is returned in "top_page", to prevent other
 862  *              callers from pursuing this same data, along with a paging
 863  *              reference for the original object.  The "top_page" should
 864  *              be destroyed when this guarantee is no longer required.
 865  *              The "result_page" is also left busy.  It is not removed
 866  *              from the pageout queues.
 867  *      Special Case:
 868  *              A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
 869  *              fault succeeded but there's no VM page (i.e. the VM object
 870  *              does not actually hold VM pages, but device memory or
 871  *              large pages).  The object is still locked and we still hold a
 872  *              paging_in_progress reference.
 873  */
 874 unsigned int vm_fault_page_blocked_access = 0; /* incremented in vm_fault_page each time it has finished waiting for object->blocked_access to clear */
 875 unsigned int vm_fault_page_forced_retry = 0; /* NOTE(review): no update is visible in this part of the file; presumably counts forced fault retries -- confirm */
 876
 877 vm_fault_return_t
 878 vm_fault_page(
 879         /* Arguments: */
 880         vm_object_t     first_object,   /* Object to begin search */
 881         vm_object_offset_t first_offset,        /* Offset into object */
 882         vm_prot_t       fault_type,     /* What access is requested */
 883         boolean_t       must_be_resident,/* Must page be resident? */
 884         boolean_t       caller_lookup,  /* caller looked up page */
 885         /* Modifies in place: */
 886         vm_prot_t       *protection,    /* Protection for mapping */
 887         vm_page_t       *result_page,   /* Page found, if successful */
 888         /* Returns: */
 889         vm_page_t       *top_page,      /* Page in top object, if
 890                                          * not result_page.  */
 891         int             *type_of_fault, /* if non-null, fill in with type of fault
 892                                          * COW, zero-fill, etc... returned in trace point */
 893         /* More arguments: */
 894         kern_return_t   *error_code,    /* code if page is in error */
 895         boolean_t       no_zero_fill,   /* don't zero fill absent pages */
 896         boolean_t       data_supply,    /* treat as data_supply if
 897                                          * it is a write fault and a full
 898                                          * page is provided */
 899         vm_object_fault_info_t fault_info)
 900 {
 901         vm_page_t               m;
 902         vm_object_t             object;
 903         vm_object_offset_t      offset;
 904         vm_page_t               first_m;
 905         vm_object_t             next_object;
 906         vm_object_t             copy_object;
 907         boolean_t               look_for_page;
 908         boolean_t               force_fault_retry = FALSE;
 909         vm_prot_t               access_required = fault_type;
 910         vm_prot_t               wants_copy_flag;
 911         kern_return_t           wait_result;
 912         wait_interrupt_t        interruptible_state;
 913         boolean_t               data_already_requested = FALSE;
 914         vm_behavior_t           orig_behavior;
 915         vm_size_t               orig_cluster_size;
 916         vm_fault_return_t       error;
 917         int                     my_fault;
 918         uint32_t                try_failed_count;
 919         int                     interruptible; /* how may fault be interrupted? */
 920         int                     external_state = VM_EXTERNAL_STATE_UNKNOWN;
 921         memory_object_t         pager;
 922         vm_fault_return_t       retval;
 923         int                     grab_options;
 924
 925 /*
 926  * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
 927  * marked as paged out in the compressor pager or the pager doesn't exist.
 928  * Note also that if the pager for an internal object
 929  * has not been created, the pager is not invoked regardless of the value
 930  * of MUST_ASK_PAGER().
 931  *
 932  * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 933  * is marked as paged out in the compressor pager.
 934  * PAGED_OUT() is used to determine if a page has already been pushed
 935  * into a copy object in order to avoid a redundant page out operation.
 936  */
 937 #define MUST_ASK_PAGER(o, f, s)                                 \
 938         ((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
 939
 940 #define PAGED_OUT(o, f) \
 941         (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
 942
 943 /*
 944  *      Recovery actions
 945  */
 946 #define RELEASE_PAGE(m)                                 \
 947         MACRO_BEGIN                                     \
 948         PAGE_WAKEUP_DONE(m);                            \
 949         if ( !VM_PAGE_PAGEABLE(m)) {                    \
 950                 vm_page_lockspin_queues();              \
 951                 if ( !VM_PAGE_PAGEABLE(m)) {            \
 952                         if (VM_CONFIG_COMPRESSOR_IS_ACTIVE)     \
 953                                 vm_page_deactivate(m);          \
 954                         else                                    \
 955                                 vm_page_activate(m);            \
 956                 }                                               \
 957                 vm_page_unlock_queues();                        \
 958         }                                                       \
 959         MACRO_END
 960
 961 #if TRACEFAULTPAGE
 962         dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
 963 #endif
 964
 965         interruptible = fault_info->interruptible;
 966         interruptible_state = thread_interrupt_level(interruptible);
 967
 968         /*
 969          *      INVARIANTS (through entire routine):
 970          *
 971          *      1)      At all times, we must either have the object
 972          *              lock or a busy page in some object to prevent
 973          *              some other thread from trying to bring in
 974          *              the same page.
 975          *
 976          *              Note that we cannot hold any locks during the
 977          *              pager access or when waiting for memory, so
 978          *              we use a busy page then.
 979          *
 980          *      2)      To prevent another thread from racing us down the
 981          *              shadow chain and entering a new page in the top
 982          *              object before we do, we must keep a busy page in
 983          *              the top object while following the shadow chain.
 984          *
 985          *      3)      We must increment paging_in_progress on any object
 986          *              for which we have a busy page before dropping
 987          *              the object lock
 988          *
 989          *      4)      We leave busy pages on the pageout queues.
 990          *              If the pageout daemon comes across a busy page,
 991          *              it will remove the page from the pageout queues.
 992          */
 993
 994         object = first_object;
 995         offset = first_offset;
 996         first_m = VM_PAGE_NULL;
 997         access_required = fault_type;
 998
 999         /*
1000          * default type of fault
1001          */
1002         my_fault = DBG_CACHE_HIT_FAULT;
1003
1004         while (TRUE) {
1005 #if TRACEFAULTPAGE
1006                 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1007 #endif
1008
1009                 grab_options = 0;
1010 #if CONFIG_SECLUDED_MEMORY
1011                 if (object->can_grab_secluded) {
1012                         grab_options |= VM_PAGE_GRAB_SECLUDED;
1013                 }
1014 #endif /* CONFIG_SECLUDED_MEMORY */
1015
1016                 if (!object->alive) {
1017                         /*
1018                          * object is no longer valid
1019                          * clean up and return error
1020                          */
1021                         vm_fault_cleanup(object, first_m);
1022                         thread_interrupt_level(interruptible_state);
1023
1024                         return VM_FAULT_MEMORY_ERROR;
1025                 }
1026
1027                 if (!object->pager_created && object->phys_contiguous) {
1028                         /*
1029                          * A physically-contiguous object without a pager:
1030                          * must be a "large page" object.  We do not deal
1031                          * with VM pages for this object.
1032                          */
1033                         caller_lookup = FALSE;
1034                         m = VM_PAGE_NULL;
1035                         goto phys_contig_object;
1036                 }
1037
1038                 if (object->blocked_access) {
1039                         /*
1040                          * Access to this VM object has been blocked.
1041                          * Replace our "paging_in_progress" reference with
1042                          * a "activity_in_progress" reference and wait for
1043                          * access to be unblocked.
1044                          */
1045                         caller_lookup = FALSE; /* no longer valid after sleep */
1046                         vm_object_activity_begin(object);
1047                         vm_object_paging_end(object);
1048                         while (object->blocked_access) {
1049                                 vm_object_sleep(object,
1050                                     VM_OBJECT_EVENT_UNBLOCKED,
1051                                     THREAD_UNINT);
1052                         }
1053                         vm_fault_page_blocked_access++;
1054                         vm_object_paging_begin(object);
1055                         vm_object_activity_end(object);
1056                 }
1057
1058                 /*
1059                  * See whether the page at 'offset' is resident
1060                  */
1061                 if (caller_lookup == TRUE) {
1062                         /*
1063                          * The caller has already looked up the page
1064                          * and gave us the result in "result_page".
1065                          * We can use this for the first lookup but
1066                          * it loses its validity as soon as we unlock
1067                          * the object.
1068                          */
1069                         m = *result_page;
1070                         caller_lookup = FALSE; /* no longer valid after that */
1071                 } else {
1072                         m = vm_page_lookup(object, offset);
1073                 }
1074 #if TRACEFAULTPAGE
1075                 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1076 #endif
1077                 if (m != VM_PAGE_NULL) {
1078                         if (m->vmp_busy) {
1079                                 /*
1080                                  * The page is being brought in,
1081                                  * wait for it and then retry.
1082                                  */
1083 #if TRACEFAULTPAGE
1084                                 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1085 #endif
1086                                 wait_result = PAGE_SLEEP(object, m, interruptible);
1087
1088                                 counter(c_vm_fault_page_block_busy_kernel++);
1089
1090                                 if (wait_result != THREAD_AWAKENED) {
1091                                         vm_fault_cleanup(object, first_m);
1092                                         thread_interrupt_level(interruptible_state);
1093
1094                                         if (wait_result == THREAD_RESTART) {
1095                                                 return VM_FAULT_RETRY;
1096                                         } else {
1097                                                 return VM_FAULT_INTERRUPTED;
1098                                         }
1099                                 }
1100                                 continue;
1101                         }
1102                         if (m->vmp_laundry) {
1103                                 m->vmp_free_when_done = FALSE;
1104
1105                                 if (!m->vmp_cleaning) {
1106                                         vm_pageout_steal_laundry(m, FALSE);
1107                                 }
1108                         }
1109                         if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
1110                                 /*
1111                                  * Guard page: off limits !
1112                                  */
1113                                 if (fault_type == VM_PROT_NONE) {
1114                                         /*
1115                                          * The fault is not requesting any
1116                                          * access to the guard page, so it must
1117                                          * be just to wire or unwire it.
1118                                          * Let's pretend it succeeded...
1119                                          */
1120                                         m->vmp_busy = TRUE;
1121                                         *result_page = m;
1122                                         assert(first_m == VM_PAGE_NULL);
1123                                         *top_page = first_m;
1124                                         if (type_of_fault) {
1125                                                 *type_of_fault = DBG_GUARD_FAULT;
1126                                         }
1127                                         thread_interrupt_level(interruptible_state);
1128                                         return VM_FAULT_SUCCESS;
1129                                 } else {
1130                                         /*
1131                                          * The fault requests access to the
1132                                          * guard page: let's deny that !
1133                                          */
1134                                         vm_fault_cleanup(object, first_m);
1135                                         thread_interrupt_level(interruptible_state);
1136                                         return VM_FAULT_MEMORY_ERROR;
1137                                 }
1138                         }
1139
1140                         if (m->vmp_error) {
1141                                 /*
1142                                  * The page is in error, give up now.
1143                                  */
1144 #if TRACEFAULTPAGE
1145                                 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);      /* (TEST/DEBUG) */
1146 #endif
1147                                 if (error_code) {
1148                                         *error_code = KERN_MEMORY_ERROR;
1149                                 }
1150                                 VM_PAGE_FREE(m);
1151
1152                                 vm_fault_cleanup(object, first_m);
1153                                 thread_interrupt_level(interruptible_state);
1154
1155                                 return VM_FAULT_MEMORY_ERROR;
1156                         }
1157                         if (m->vmp_restart) {
1158                                 /*
1159                                  * The pager wants us to restart
1160                                  * at the top of the chain,
1161                                  * typically because it has moved the
1162                                  * page to another pager, then do so.
1163                                  */
1164 #if TRACEFAULTPAGE
1165                                 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1166 #endif
1167                                 VM_PAGE_FREE(m);
1168
1169                                 vm_fault_cleanup(object, first_m);
1170                                 thread_interrupt_level(interruptible_state);
1171
1172                                 return VM_FAULT_RETRY;
1173                         }
1174                         if (m->vmp_absent) {
1175                                 /*
1176                                  * The page isn't busy, but is absent,
1177                                  * therefore it's deemed "unavailable".
1178                                  *
1179                                  * Remove the non-existent page (unless it's
1180                                  * in the top object) and move on down to the
1181                                  * next object (if there is one).
1182                                  */
1183 #if TRACEFAULTPAGE
1184                                 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);  /* (TEST/DEBUG) */
1185 #endif
1186                                 next_object = object->shadow;
1187
1188                                 if (next_object == VM_OBJECT_NULL) {
1189                                         /*
1190                                          * Absent page at bottom of shadow
1191                                          * chain; zero fill the page we left
1192                                          * busy in the first object, and free
1193                                          * the absent page.
1194                                          */
1195                                         assert(!must_be_resident);
1196
1197                                         /*
1198                                          * check for any conditions that prevent
1199                                          * us from creating a new zero-fill page
1200                                          * vm_fault_check will do all of the
1201                                          * fault cleanup in the case of an error condition
1202                                          * including resetting the thread_interrupt_level
1203                                          */
1204                                         error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1205
1206                                         if (error != VM_FAULT_SUCCESS) {
1207                                                 return error;
1208                                         }
1209
1210                                         if (object != first_object) {
1211                                                 /*
1212                                                  * free the absent page we just found
1213                                                  */
1214                                                 VM_PAGE_FREE(m);
1215
1216                                                 /*
1217                                                  * drop reference and lock on current object
1218                                                  */
1219                                                 vm_object_paging_end(object);
1220                                                 vm_object_unlock(object);
1221
1222                                                 /*
1223                                                  * grab the original page we
1224                                                  * 'soldered' in place and
1225                                                  * retake lock on 'first_object'
1226                                                  */
1227                                                 m = first_m;
1228                                                 first_m = VM_PAGE_NULL;
1229
1230                                                 object = first_object;
1231                                                 offset = first_offset;
1232
1233                                                 vm_object_lock(object);
1234                                         } else {
1235                                                 /*
1236                                                  * we're going to use the absent page we just found
1237                                                  * so convert it to a 'busy' page
1238                                                  */
1239                                                 m->vmp_absent = FALSE;
1240                                                 m->vmp_busy = TRUE;
1241                                         }
1242                                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
1243                                                 m->vmp_absent = TRUE;
1244                                         }
1245                                         /*
1246                                          * zero-fill the page and put it on
1247                                          * the correct paging queue
1248                                          */
1249                                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1250
1251                                         break;
1252                                 } else {
1253                                         if (must_be_resident) {
1254                                                 vm_object_paging_end(object);
1255                                         } else if (object != first_object) {
1256                                                 vm_object_paging_end(object);
1257                                                 VM_PAGE_FREE(m);
1258                                         } else {
1259                                                 first_m = m;
1260                                                 m->vmp_absent = FALSE;
1261                                                 m->vmp_busy = TRUE;
1262
1263                                                 vm_page_lockspin_queues();
1264                                                 vm_page_queues_remove(m, FALSE);
1265                                                 vm_page_unlock_queues();
1266                                         }
1267
1268                                         offset += object->vo_shadow_offset;
1269                                         fault_info->lo_offset += object->vo_shadow_offset;
1270                                         fault_info->hi_offset += object->vo_shadow_offset;
1271                                         access_required = VM_PROT_READ;
1272
1273                                         vm_object_lock(next_object);
1274                                         vm_object_unlock(object);
1275                                         object = next_object;
1276                                         vm_object_paging_begin(object);
1277
1278                                         /*
1279                                          * reset to default type of fault
1280                                          */
1281                                         my_fault = DBG_CACHE_HIT_FAULT;
1282
1283                                         continue;
1284                                 }
1285                         }
1286                         if ((m->vmp_cleaning)
1287                             && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1288                             && (fault_type & VM_PROT_WRITE)) {
1289                                 /*
1290                                  * This is a copy-on-write fault that will
1291                                  * cause us to revoke access to this page, but
1292                                  * this page is in the process of being cleaned
1293                                  * in a clustered pageout. We must wait until
1294                                  * the cleaning operation completes before
1295                                  * revoking access to the original page,
1296                                  * otherwise we might attempt to remove a
1297                                  * wired mapping.
1298                                  */
1299 #if TRACEFAULTPAGE
1300                                 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
1301 #endif
1302                                 /*
1303                                  * take an extra ref so that object won't die
1304                                  */
1305                                 vm_object_reference_locked(object);
1306
1307                                 vm_fault_cleanup(object, first_m);
1308
1309                                 counter(c_vm_fault_page_block_backoff_kernel++);
1310                                 vm_object_lock(object);
1311                                 assert(object->ref_count > 0);
1312
1313                                 m = vm_page_lookup(object, offset);
1314
1315                                 if (m != VM_PAGE_NULL && m->vmp_cleaning) {
1316                                         PAGE_ASSERT_WAIT(m, interruptible);
1317
1318                                         vm_object_unlock(object);
1319                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1320                                         vm_object_deallocate(object);
1321
1322                                         goto backoff;
1323                                 } else {
1324                                         vm_object_unlock(object);
1325
1326                                         vm_object_deallocate(object);
1327                                         thread_interrupt_level(interruptible_state);
1328
1329                                         return VM_FAULT_RETRY;
1330                                 }
1331                         }
1332                         if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
1333                             !(fault_info != NULL && fault_info->stealth)) {
1334                                 /*
1335                                  * If we were passed a non-NULL pointer for
1336                                  * "type_of_fault", than we came from
1337                                  * vm_fault... we'll let it deal with
1338                                  * this condition, since it
1339                                  * needs to see m->vmp_speculative to correctly
1340                                  * account the pageins, otherwise...
1341                                  * take it off the speculative queue, we'll
1342                                  * let the caller of vm_fault_page deal
1343                                  * with getting it onto the correct queue
1344                                  *
1345                                  * If the caller specified in fault_info that
1346                                  * it wants a "stealth" fault, we also leave
1347                                  * the page in the speculative queue.
1348                                  */
1349                                 vm_page_lockspin_queues();
1350                                 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
1351                                         vm_page_queues_remove(m, FALSE);
1352                                 }
1353                                 vm_page_unlock_queues();
1354                         }
1355                         assert(object == VM_PAGE_OBJECT(m));
1356
1357                         if (object->code_signed) {
1358                                 /*
1359                                  * CODE SIGNING:
1360                                  * We just paged in a page from a signed
1361                                  * memory object but we don't need to
1362                                  * validate it now.  We'll validate it if
1363                                  * when it gets mapped into a user address
1364                                  * space for the first time or when the page
1365                                  * gets copied to another object as a result
1366                                  * of a copy-on-write.
1367                                  */
1368                         }
1369
1370                         /*
1371                          * We mark the page busy and leave it on
1372                          * the pageout queues.  If the pageout
1373                          * daemon comes across it, then it will
1374                          * remove the page from the queue, but not the object
1375                          */
1376 #if TRACEFAULTPAGE
1377                         dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1378 #endif
1379                         assert(!m->vmp_busy);
1380                         assert(!m->vmp_absent);
1381
1382                         m->vmp_busy = TRUE;
1383                         break;
1384                 }
1385
1386
1387                 /*
1388                  * we get here when there is no page present in the object at
1389                  * the offset we're interested in... we'll allocate a page
1390                  * at this point if the pager associated with
1391                  * this object can provide the data or we're the top object...
1392                  * object is locked;  m == NULL
1393                  */
1394
1395                 if (must_be_resident) {
1396                         if (fault_type == VM_PROT_NONE &&
1397                             object == kernel_object) {
1398                                 /*
1399                                  * We've been called from vm_fault_unwire()
1400                                  * while removing a map entry that was allocated
1401                                  * with KMA_KOBJECT and KMA_VAONLY.  This page
1402                                  * is not present and there's nothing more to
1403                                  * do here (nothing to unwire).
1404                                  */
1405                                 vm_fault_cleanup(object, first_m);
1406                                 thread_interrupt_level(interruptible_state);
1407
1408                                 return VM_FAULT_MEMORY_ERROR;
1409                         }
1410
1411                         goto dont_look_for_page;
1412                 }
1413
1414                 /* Don't expect to fault pages into the kernel object. */
1415                 assert(object != kernel_object);
1416
1417                 data_supply = FALSE;
1418
1419                 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply);
1420
1421 #if TRACEFAULTPAGE
1422                 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);      /* (TEST/DEBUG) */
1423 #endif
1424                 if (!look_for_page && object == first_object && !object->phys_contiguous) {
1425                         /*
1426                          * Allocate a new page for this object/offset pair as a placeholder
1427                          */
1428                         m = vm_page_grab_options(grab_options);
1429 #if TRACEFAULTPAGE
1430                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1431 #endif
1432                         if (m == VM_PAGE_NULL) {
1433                                 vm_fault_cleanup(object, first_m);
1434                                 thread_interrupt_level(interruptible_state);
1435
1436                                 return VM_FAULT_MEMORY_SHORTAGE;
1437                         }
1438
1439                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1440                                 vm_page_insert_internal(m, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1441                         } else {
1442                                 vm_page_insert(m, object, offset);
1443                         }
1444                 }
1445                 if (look_for_page) {
1446                         kern_return_t   rc;
1447                         int             my_fault_type;
1448
1449                         /*
1450                          *      If the memory manager is not ready, we
1451                          *      cannot make requests.
1452                          */
1453                         if (!object->pager_ready) {
1454 #if TRACEFAULTPAGE
1455                                 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1456 #endif
1457                                 if (m != VM_PAGE_NULL) {
1458                                         VM_PAGE_FREE(m);
1459                                 }
1460
1461                                 /*
1462                                  * take an extra ref so object won't die
1463                                  */
1464                                 vm_object_reference_locked(object);
1465                                 vm_fault_cleanup(object, first_m);
1466                                 counter(c_vm_fault_page_block_backoff_kernel++);
1467
1468                                 vm_object_lock(object);
1469                                 assert(object->ref_count > 0);
1470
1471                                 if (!object->pager_ready) {
1472                                         wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1473
1474                                         vm_object_unlock(object);
1475                                         if (wait_result == THREAD_WAITING) {
1476                                                 wait_result = thread_block(THREAD_CONTINUE_NULL);
1477                                         }
1478                                         vm_object_deallocate(object);
1479
1480                                         goto backoff;
1481                                 } else {
1482                                         vm_object_unlock(object);
1483                                         vm_object_deallocate(object);
1484                                         thread_interrupt_level(interruptible_state);
1485
1486                                         return VM_FAULT_RETRY;
1487                                 }
1488                         }
1489                         if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1490                                 /*
1491                                  * If there are too many outstanding page
1492                                  * requests pending on this external object, we
1493                                  * wait for them to be resolved now.
1494                                  */
1495 #if TRACEFAULTPAGE
1496                                 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1497 #endif
1498                                 if (m != VM_PAGE_NULL) {
1499                                         VM_PAGE_FREE(m);
1500                                 }
1501                                 /*
1502                                  * take an extra ref so object won't die
1503                                  */
1504                                 vm_object_reference_locked(object);
1505
1506                                 vm_fault_cleanup(object, first_m);
1507
1508                                 counter(c_vm_fault_page_block_backoff_kernel++);
1509
1510                                 vm_object_lock(object);
1511                                 assert(object->ref_count > 0);
1512
1513                                 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1514                                         vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1515
1516                                         vm_object_unlock(object);
1517                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1518                                         vm_object_deallocate(object);
1519
1520                                         goto backoff;
1521                                 } else {
1522                                         vm_object_unlock(object);
1523                                         vm_object_deallocate(object);
1524                                         thread_interrupt_level(interruptible_state);
1525
1526                                         return VM_FAULT_RETRY;
1527                                 }
1528                         }
1529                         if (object->internal) {
1530                                 int compressed_count_delta;
1531
1532                                 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
1533
1534                                 if (m == VM_PAGE_NULL) {
1535                                         /*
1536                                          * Allocate a new page for this object/offset pair as a placeholder
1537                                          */
1538                                         m = vm_page_grab_options(grab_options);
1539 #if TRACEFAULTPAGE
1540                                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1541 #endif
1542                                         if (m == VM_PAGE_NULL) {
1543                                                 vm_fault_cleanup(object, first_m);
1544                                                 thread_interrupt_level(interruptible_state);
1545
1546                                                 return VM_FAULT_MEMORY_SHORTAGE;
1547                                         }
1548
1549                                         m->vmp_absent = TRUE;
1550                                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1551                                                 vm_page_insert_internal(m, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1552                                         } else {
1553                                                 vm_page_insert(m, object, offset);
1554                                         }
1555                                 }
1556                                 assert(m->vmp_busy);
1557
1558                                 m->vmp_absent = TRUE;
1559                                 pager = object->pager;
1560
1561                                 assert(object->paging_in_progress > 0);
1562                                 vm_object_unlock(object);
1563
1564                                 rc = vm_compressor_pager_get(
1565                                         pager,
1566                                         offset + object->paging_offset,
1567                                         VM_PAGE_GET_PHYS_PAGE(m),
1568                                         &my_fault_type,
1569                                         0,
1570                                         &compressed_count_delta);
1571
1572                                 if (type_of_fault == NULL) {
1573                                         int     throttle_delay;
1574
1575                                         /*
1576                                          * we weren't called from vm_fault, so we
1577                                          * need to apply page creation throttling
1578                                          * do it before we re-acquire any locks
1579                                          */
1580                                         if (my_fault_type == DBG_COMPRESSOR_FAULT) {
1581                                                 if ((throttle_delay = vm_page_throttled(TRUE))) {
1582                                                         VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
1583                                                         delay(throttle_delay);
1584                                                 }
1585                                         }
1586                                 }
1587                                 vm_object_lock(object);
1588                                 assert(object->paging_in_progress > 0);
1589
1590                                 vm_compressor_pager_count(
1591                                         pager,
1592                                         compressed_count_delta,
1593                                         FALSE, /* shared_lock */
1594                                         object);
1595
1596                                 switch (rc) {
1597                                 case KERN_SUCCESS:
1598                                         m->vmp_absent = FALSE;
1599                                         m->vmp_dirty = TRUE;
1600                                         if ((object->wimg_bits &
1601                                             VM_WIMG_MASK) !=
1602                                             VM_WIMG_USE_DEFAULT) {
1603                                                 /*
1604                                                  * If the page is not cacheable,
1605                                                  * we can't let its contents
1606                                                  * linger in the data cache
1607                                                  * after the decompression.
1608                                                  */
1609                                                 pmap_sync_page_attributes_phys(
1610                                                         VM_PAGE_GET_PHYS_PAGE(m));
1611                                         } else {
1612                                                 m->vmp_written_by_kernel = TRUE;
1613                                         }
1614
1615                                         /*
1616                                          * If the object is purgeable, its
1617                                          * owner's purgeable ledgers have been
1618                                          * updated in vm_page_insert() but the
1619                                          * page was also accounted for in a
1620                                          * "compressed purgeable" ledger, so
1621                                          * update that now.
1622                                          */
1623                                         if (((object->purgable !=
1624                                             VM_PURGABLE_DENY) ||
1625                                             object->vo_ledger_tag) &&
1626                                             (object->vo_owner !=
1627                                             NULL)) {
1628                                                 /*
1629                                                  * One less compressed
1630                                                  * purgeable/tagged page.
1631                                                  */
1632                                                 vm_object_owner_compressed_update(
1633                                                         object,
1634                                                         -1);
1635                                         }
1636
1637                                         break;
1638                                 case KERN_MEMORY_FAILURE:
1639                                         m->vmp_unusual = TRUE;
1640                                         m->vmp_error = TRUE;
1641                                         m->vmp_absent = FALSE;
1642                                         break;
1643                                 case KERN_MEMORY_ERROR:
1644                                         assert(m->vmp_absent);
1645                                         break;
1646                                 default:
1647                                         panic("vm_fault_page(): unexpected "
1648                                             "error %d from "
1649                                             "vm_compressor_pager_get()\n",
1650                                             rc);
1651                                 }
1652                                 PAGE_WAKEUP_DONE(m);
1653
1654                                 rc = KERN_SUCCESS;
1655                                 goto data_requested;
1656                         }
1657                         my_fault_type = DBG_PAGEIN_FAULT;
1658
1659                         if (m != VM_PAGE_NULL) {
1660                                 VM_PAGE_FREE(m);
1661                                 m = VM_PAGE_NULL;
1662                         }
1663
1664 #if TRACEFAULTPAGE
1665                         dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);  /* (TEST/DEBUG) */
1666 #endif
1667
1668                         /*
1669                          * It's possible someone called vm_object_destroy while we weren't
1670                          * holding the object lock.  If that has happened, then bail out
1671                          * here.
1672                          */
1673
1674                         pager = object->pager;
1675
1676                         if (pager == MEMORY_OBJECT_NULL) {
1677                                 vm_fault_cleanup(object, first_m);
1678                                 thread_interrupt_level(interruptible_state);
1679                                 return VM_FAULT_MEMORY_ERROR;
1680                         }
1681
1682                         /*
1683                          * We have an absent page in place for the faulting offset,
1684                          * so we can release the object lock.
1685                          */
1686
1687                         if (object->object_is_shared_cache) {
1688                                 set_thread_rwlock_boost();
1689                         }
1690
1691                         vm_object_unlock(object);
1692
1693                         /*
1694                          * If this object uses a copy_call strategy,
1695                          * and we are interested in a copy of this object
1696                          * (having gotten here only by following a
1697                          * shadow chain), then tell the memory manager
1698                          * via a flag added to the desired_access
1699                          * parameter, so that it can detect a race
1700                          * between our walking down the shadow chain
1701                          * and its pushing pages up into a copy of
1702                          * the object that it manages.
1703                          */
1704                         if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) {
1705                                 wants_copy_flag = VM_PROT_WANTS_COPY;
1706                         } else {
1707                                 wants_copy_flag = VM_PROT_NONE;
1708                         }
1709
1710                         if (object->copy == first_object) {
1711                                 /*
1712                                  * if we issue the memory_object_data_request in
1713                                  * this state, we are subject to a deadlock with
1714                                  * the underlying filesystem if it is trying to
1715                                  * shrink the file resulting in a push of pages
1716                                  * into the copy object...  that push will stall
1717                                  * on the placeholder page, and if the pushing thread
1718                                  * is holding a lock that is required on the pagein
1719                                  * path (such as a truncate lock), we'll deadlock...
1720                                  * to avoid this potential deadlock, we throw away
1721                                  * our placeholder page before calling memory_object_data_request
1722                                  * and force this thread to retry the vm_fault_page after
1723                                  * we have issued the I/O.  the second time through this path
1724                                  * we will find the page already in the cache (presumably still
1725                                  * busy waiting for the I/O to complete) and then complete
1726                                  * the fault w/o having to go through memory_object_data_request again
1727                                  */
1728                                 assert(first_m != VM_PAGE_NULL);
1729                                 assert(VM_PAGE_OBJECT(first_m) == first_object);
1730
1731                                 vm_object_lock(first_object);
1732                                 VM_PAGE_FREE(first_m);
1733                                 vm_object_paging_end(first_object);
1734                                 vm_object_unlock(first_object);
1735
1736                                 first_m = VM_PAGE_NULL;
1737                                 force_fault_retry = TRUE;
1738
1739                                 vm_fault_page_forced_retry++;
1740                         }
1741
1742                         if (data_already_requested == TRUE) {
1743                                 orig_behavior = fault_info->behavior;
1744                                 orig_cluster_size = fault_info->cluster_size;
1745
1746                                 fault_info->behavior = VM_BEHAVIOR_RANDOM;
1747                                 fault_info->cluster_size = PAGE_SIZE;
1748                         }
1749                         /*
1750                          * Call the memory manager to retrieve the data.
1751                          */
1752                         rc = memory_object_data_request(
1753                                 pager,
1754                                 offset + object->paging_offset,
1755                                 PAGE_SIZE,
1756                                 access_required | wants_copy_flag,
1757                                 (memory_object_fault_info_t)fault_info);
1758
1759                         if (data_already_requested == TRUE) {
1760                                 fault_info->behavior = orig_behavior;
1761                                 fault_info->cluster_size = orig_cluster_size;
1762                         } else {
1763                                 data_already_requested = TRUE;
1764                         }
1765
1766                         DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1767 #if TRACEFAULTPAGE
1768                         dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1769 #endif
1770                         vm_object_lock(object);
1771
1772                         if (object->object_is_shared_cache) {
1773                                 clear_thread_rwlock_boost();
1774                         }
1775
1776 data_requested:
1777                         if (rc != KERN_SUCCESS) {
1778                                 vm_fault_cleanup(object, first_m);
1779                                 thread_interrupt_level(interruptible_state);
1780
1781                                 return (rc == MACH_SEND_INTERRUPTED) ?
1782                                        VM_FAULT_INTERRUPTED :
1783                                        VM_FAULT_MEMORY_ERROR;
1784                         } else {
1785                                 clock_sec_t     tv_sec;
1786                                 clock_usec_t    tv_usec;
1787
1788                                 if (my_fault_type == DBG_PAGEIN_FAULT) {
1789                                         clock_get_system_microtime(&tv_sec, &tv_usec);
1790                                         current_thread()->t_page_creation_time = tv_sec;
1791                                         current_thread()->t_page_creation_count = 0;
1792                                 }
1793                         }
1794                         if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1795                                 vm_fault_cleanup(object, first_m);
1796                                 thread_interrupt_level(interruptible_state);
1797
1798                                 return VM_FAULT_INTERRUPTED;
1799                         }
1800                         if (force_fault_retry == TRUE) {
1801                                 vm_fault_cleanup(object, first_m);
1802                                 thread_interrupt_level(interruptible_state);
1803
1804                                 return VM_FAULT_RETRY;
1805                         }
1806                         if (m == VM_PAGE_NULL && object->phys_contiguous) {
1807                                 /*
1808                                  * No page here means that the object we
1809                                  * initially looked up was "physically
1810                                  * contiguous" (i.e. device memory).  However,
1811                                  * with Virtual VRAM, the object might not
1812                                  * be backed by that device memory anymore,
1813                                  * so we're done here only if the object is
1814                                  * still "phys_contiguous".
1815                                  * Otherwise, if the object is no longer
1816                                  * "phys_contiguous", we need to retry the
1817                                  * page fault against the object's new backing
1818                                  * store (different memory object).
1819                                  */
1820 phys_contig_object:
1821                                 goto done;
1822                         }
1823                         /*
1824                          * potentially a pagein fault
1825                          * if we make it through the state checks
1826                          * above, then we'll count it as such
1827                          */
1828                         my_fault = my_fault_type;
1829
1830                         /*
1831                          * Retry with same object/offset, since new data may
1832                          * be in a different page (i.e., m is meaningless at
1833                          * this point).
1834                          */
1835                         continue;
1836                 }
1837 dont_look_for_page:
1838                 /*
1839                  * We get here if the object has no pager, or an existence map
1840                  * exists and indicates the page isn't present on the pager
1841                  * or we're unwiring a page.  If a pager exists, but there
1842                  * is no existence map, then the m->vmp_absent case above handles
1843                  * the ZF case when the pager can't provide the page
1844                  */
1845 #if TRACEFAULTPAGE
1846                 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1847 #endif
1848                 if (object == first_object) {
1849                         first_m = m;
1850                 } else {
1851                         assert(m == VM_PAGE_NULL);
1852                 }
1853
1854                 next_object = object->shadow;
1855
1856                 if (next_object == VM_OBJECT_NULL) {
1857                         /*
1858                          * we've hit the bottom of the shadow chain,
1859                          * fill the page in the top object with zeros.
1860                          */
1861                         assert(!must_be_resident);
1862
1863                         if (object != first_object) {
1864                                 vm_object_paging_end(object);
1865                                 vm_object_unlock(object);
1866
1867                                 object = first_object;
1868                                 offset = first_offset;
1869                                 vm_object_lock(object);
1870                         }
1871                         m = first_m;
1872                         assert(VM_PAGE_OBJECT(m) == object);
1873                         first_m = VM_PAGE_NULL;
1874
1875                         /*
1876                          * check for any conditions that prevent
1877                          * us from creating a new zero-fill page
1878                          * vm_fault_check will do all of the
1879                          * fault cleanup in the case of an error condition
1880                          * including resetting the thread_interrupt_level
1881                          */
1882                         error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1883
1884                         if (error != VM_FAULT_SUCCESS) {
1885                                 return error;
1886                         }
1887
1888                         if (m == VM_PAGE_NULL) {
1889                                 m = vm_page_grab_options(grab_options);
1890
1891                                 if (m == VM_PAGE_NULL) {
1892                                         vm_fault_cleanup(object, VM_PAGE_NULL);
1893                                         thread_interrupt_level(interruptible_state);
1894
1895                                         return VM_FAULT_MEMORY_SHORTAGE;
1896                                 }
1897                                 vm_page_insert(m, object, offset);
1898                         }
1899                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
1900                                 m->vmp_absent = TRUE;
1901                         }
1902
1903                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1904
1905                         break;
1906                 } else {
1907                         /*
1908                          * Move on to the next object.  Lock the next
1909                          * object before unlocking the current one.
1910                          */
1911                         if ((object != first_object) || must_be_resident) {
1912                                 vm_object_paging_end(object);
1913                         }
1914
1915                         offset += object->vo_shadow_offset;
1916                         fault_info->lo_offset += object->vo_shadow_offset;
1917                         fault_info->hi_offset += object->vo_shadow_offset;
1918                         access_required = VM_PROT_READ;
1919
1920                         vm_object_lock(next_object);
1921                         vm_object_unlock(object);
1922
1923                         object = next_object;
1924                         vm_object_paging_begin(object);
1925                 }
1926         }
1927
1928         /*
1929          *      PAGE HAS BEEN FOUND.
1930          *
1931          *      This page (m) is:
1932          *              busy, so that we can play with it;
1933          *              not absent, so that nobody else will fill it;
1934          *              possibly eligible for pageout;
1935          *
1936          *      The top-level page (first_m) is:
1937          *              VM_PAGE_NULL if the page was found in the
1938          *               top-level object;
1939          *              busy, not absent, and ineligible for pageout.
1940          *
1941          *      The current object (object) is locked.  A paging
1942          *      reference is held for the current and top-level
1943          *      objects.
1944          */
1945
1946 #if TRACEFAULTPAGE
1947         dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1948 #endif
1949 #if     EXTRA_ASSERTIONS
1950         assert(m->vmp_busy && !m->vmp_absent);
1951         assert((first_m == VM_PAGE_NULL) ||
1952             (first_m->vmp_busy && !first_m->vmp_absent &&
1953             !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded));
1954 #endif  /* EXTRA_ASSERTIONS */
1955
1956         /*
1957          * If the page is being written, but isn't
1958          * already owned by the top-level object,
1959          * we have to copy it into a new page owned
1960          * by the top-level object.
1961          */
1962         if (object != first_object) {
1963 #if TRACEFAULTPAGE
1964                 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1965 #endif
1966                 if (fault_type & VM_PROT_WRITE) {
1967                         vm_page_t copy_m;
1968
1969                         /*
1970                          * We only really need to copy if we
1971                          * want to write it.
1972                          */
1973                         assert(!must_be_resident);
1974
1975                         /*
1976                          * If we try to collapse first_object at this
1977                          * point, we may deadlock when we try to get
1978                          * the lock on an intermediate object (since we
1979                          * have the bottom object locked).  We can't
1980                          * unlock the bottom object, because the page
1981                          * we found may move (by collapse) if we do.
1982                          *
1983                          * Instead, we first copy the page.  Then, when
1984                          * we have no more use for the bottom object,
1985                          * we unlock it and try to collapse.
1986                          *
1987                          * Note that we copy the page even if we didn't
1988                          * need to... that's the breaks.
1989                          */
1990
1991                         /*
1992                          * Allocate a page for the copy
1993                          */
1994                         copy_m = vm_page_grab_options(grab_options);
1995
1996                         if (copy_m == VM_PAGE_NULL) {
1997                                 RELEASE_PAGE(m);
1998
1999                                 vm_fault_cleanup(object, first_m);
2000                                 thread_interrupt_level(interruptible_state);
2001
2002                                 return VM_FAULT_MEMORY_SHORTAGE;
2003                         }
2004
2005                         vm_page_copy(m, copy_m);
2006
2007                         /*
2008                          * If another map is truly sharing this
2009                          * page with us, we have to flush all
2010                          * uses of the original page, since we
2011                          * can't distinguish those which want the
2012                          * original from those which need the
2013                          * new copy.
2014                          *
2015                          * XXXO If we know that only one map has
2016                          * access to this page, then we could
2017                          * avoid the pmap_disconnect() call.
2018                          */
2019                         if (m->vmp_pmapped) {
2020                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2021                         }
2022
2023                         if (m->vmp_clustered) {
2024                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2025                                 VM_PAGE_CONSUME_CLUSTERED(m);
2026                         }
2027                         assert(!m->vmp_cleaning);
2028
2029                         /*
2030                          * We no longer need the old page or object.
2031                          */
2032                         RELEASE_PAGE(m);
2033
2034                         /*
2035                          * This check helps with marking the object as having a sequential pattern
2036                          * Normally we'll miss doing this below because this fault is about COW to
2037                          * the first_object i.e. bring page in from disk, push to object above but
2038                          * don't update the file object's sequential pattern.
2039                          */
2040                         if (object->internal == FALSE) {
2041                                 vm_fault_is_sequential(object, offset, fault_info->behavior);
2042                         }
2043
2044                         vm_object_paging_end(object);
2045                         vm_object_unlock(object);
2046
2047                         my_fault = DBG_COW_FAULT;
2048                         VM_STAT_INCR(cow_faults);
2049                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2050                         current_task()->cow_faults++;
2051
2052                         object = first_object;
2053                         offset = first_offset;
2054
2055                         vm_object_lock(object);
2056                         /*
2057                          * get rid of the place holder
2058                          * page that we soldered in earlier
2059                          */
2060                         VM_PAGE_FREE(first_m);
2061                         first_m = VM_PAGE_NULL;
2062
2063                         /*
2064                          * and replace it with the
2065                          * page we just copied into
2066                          */
2067                         assert(copy_m->vmp_busy);
2068                         vm_page_insert(copy_m, object, offset);
2069                         SET_PAGE_DIRTY(copy_m, TRUE);
2070
2071                         m = copy_m;
2072                         /*
2073                          * Now that we've gotten the copy out of the
2074                          * way, let's try to collapse the top object.
2075                          * But we have to play ugly games with
2076                          * paging_in_progress to do that...
2077                          */
2078                         vm_object_paging_end(object);
2079                         vm_object_collapse(object, offset, TRUE);
2080                         vm_object_paging_begin(object);
2081                 } else {
2082                         *protection &= (~VM_PROT_WRITE);
2083                 }
2084         }
2085         /*
2086          * Now check whether the page needs to be pushed into the
2087          * copy object.  The use of asymmetric copy on write for
2088          * shared temporary objects means that we may do two copies to
2089          * satisfy the fault; one above to get the page from a
2090          * shadowed object, and one here to push it into the copy.
2091          */
2092         try_failed_count = 0;
2093
2094         while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
2095                 vm_object_offset_t      copy_offset;
2096                 vm_page_t               copy_m;
2097
2098 #if TRACEFAULTPAGE
2099                 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);    /* (TEST/DEBUG) */
2100 #endif
2101                 /*
2102                  * If the page is being written, but hasn't been
2103                  * copied to the copy-object, we have to copy it there.
2104                  */
2105                 if ((fault_type & VM_PROT_WRITE) == 0) {
2106                         *protection &= ~VM_PROT_WRITE;
2107                         break;
2108                 }
2109
2110                 /*
2111                  * If the page was guaranteed to be resident,
2112                  * we must have already performed the copy.
2113                  */
2114                 if (must_be_resident) {
2115                         break;
2116                 }
2117
2118                 /*
2119                  * Try to get the lock on the copy_object.
2120                  */
2121                 if (!vm_object_lock_try(copy_object)) {
2122                         vm_object_unlock(object);
2123                         try_failed_count++;
2124
2125                         mutex_pause(try_failed_count);  /* wait a bit */
2126                         vm_object_lock(object);
2127
2128                         continue;
2129                 }
2130                 try_failed_count = 0;
2131
2132                 /*
2133                  * Make another reference to the copy-object,
2134                  * to keep it from disappearing during the
2135                  * copy.
2136                  */
2137                 vm_object_reference_locked(copy_object);
2138
2139                 /*
2140                  * Does the page exist in the copy?
2141                  */
2142                 copy_offset = first_offset - copy_object->vo_shadow_offset;
2143
2144                 if (copy_object->vo_size <= copy_offset) {
2145                         /*
2146                          * Copy object doesn't cover this page -- do nothing.
2147                          */
2148                         ;
2149                 } else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2150                         /*
2151                          * Page currently exists in the copy object
2152                          */
2153                         if (copy_m->vmp_busy) {
2154                                 /*
2155                                  * If the page is being brought
2156                                  * in, wait for it and then retry.
2157                                  */
2158                                 RELEASE_PAGE(m);
2159
2160                                 /*
2161                                  * take an extra ref so object won't die
2162                                  */
2163                                 vm_object_reference_locked(copy_object);
2164                                 vm_object_unlock(copy_object);
2165                                 vm_fault_cleanup(object, first_m);
2166                                 counter(c_vm_fault_page_block_backoff_kernel++);
2167
2168                                 vm_object_lock(copy_object);
2169                                 assert(copy_object->ref_count > 0);
2170                                 VM_OBJ_RES_DECR(copy_object);
2171                                 vm_object_lock_assert_exclusive(copy_object);
2172                                 copy_object->ref_count--;
2173                                 assert(copy_object->ref_count > 0);
2174                                 copy_m = vm_page_lookup(copy_object, copy_offset);
2175
2176                                 if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) {
2177                                         PAGE_ASSERT_WAIT(copy_m, interruptible);
2178
2179                                         vm_object_unlock(copy_object);
2180                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
2181                                         vm_object_deallocate(copy_object);
2182
2183                                         goto backoff;
2184                                 } else {
2185                                         vm_object_unlock(copy_object);
2186                                         vm_object_deallocate(copy_object);
2187                                         thread_interrupt_level(interruptible_state);
2188
2189                                         return VM_FAULT_RETRY;
2190                                 }
2191                         }
2192                 } else if (!PAGED_OUT(copy_object, copy_offset)) {
2193                         /*
2194                          * If PAGED_OUT is TRUE, then the page used to exist
2195                          * in the copy-object, and has already been paged out.
2196                          * We don't need to repeat this. If PAGED_OUT is
2197                          * FALSE, then either we don't know (!pager_created,
2198                          * for example) or it hasn't been paged out.
2199                          * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2200                          * We must copy the page to the copy object.
2201                          *
2202                          * Allocate a page for the copy
2203                          */
2204                         copy_m = vm_page_alloc(copy_object, copy_offset);
2205
2206                         if (copy_m == VM_PAGE_NULL) {
2207                                 RELEASE_PAGE(m);
2208
2209                                 VM_OBJ_RES_DECR(copy_object);
2210                                 vm_object_lock_assert_exclusive(copy_object);
2211                                 copy_object->ref_count--;
2212                                 assert(copy_object->ref_count > 0);
2213
2214                                 vm_object_unlock(copy_object);
2215                                 vm_fault_cleanup(object, first_m);
2216                                 thread_interrupt_level(interruptible_state);
2217
2218                                 return VM_FAULT_MEMORY_SHORTAGE;
2219                         }
2220                         /*
2221                          * Must copy page into copy-object.
2222                          */
2223                         vm_page_copy(m, copy_m);
2224
2225                         /*
2226                          * If the old page was in use by any users
2227                          * of the copy-object, it must be removed
2228                          * from all pmaps.  (We can't know which
2229                          * pmaps use it.)
2230                          */
2231                         if (m->vmp_pmapped) {
2232                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2233                         }
2234
2235                         if (m->vmp_clustered) {
2236                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2237                                 VM_PAGE_CONSUME_CLUSTERED(m);
2238                         }
2239                         /*
2240                          * If there's a pager, then immediately
2241                          * page out this page, using the "initialize"
2242                          * option.  Else, we use the copy.
2243                          */
2244                         if ((!copy_object->pager_ready)
2245                             || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2246                             ) {
2247                                 vm_page_lockspin_queues();
2248                                 assert(!m->vmp_cleaning);
2249                                 vm_page_activate(copy_m);
2250                                 vm_page_unlock_queues();
2251
2252                                 SET_PAGE_DIRTY(copy_m, TRUE);
2253                                 PAGE_WAKEUP_DONE(copy_m);
2254                         } else {
2255                                 assert(copy_m->vmp_busy == TRUE);
2256                                 assert(!m->vmp_cleaning);
2257
2258                                 /*
2259                                  * dirty is protected by the object lock
2260                                  */
2261                                 SET_PAGE_DIRTY(copy_m, TRUE);
2262
2263                                 /*
2264                                  * The page is already ready for pageout:
2265                                  * not on pageout queues and busy.
2266                                  * Unlock everything except the
2267                                  * copy_object itself.
2268                                  */
2269                                 vm_object_unlock(object);
2270
2271                                 /*
2272                                  * Write the page to the copy-object,
2273                                  * flushing it from the kernel.
2274                                  */
2275                                 vm_pageout_initialize_page(copy_m);
2276
2277                                 /*
2278                                  * Since the pageout may have
2279                                  * temporarily dropped the
2280                                  * copy_object's lock, we
2281                                  * check whether we'll have
2282                                  * to deallocate the hard way.
2283                                  */
2284                                 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2285                                         vm_object_unlock(copy_object);
2286                                         vm_object_deallocate(copy_object);
2287                                         vm_object_lock(object);
2288
2289                                         continue;
2290                                 }
2291                                 /*
2292                                  * Pick back up the old object's
2293                                  * lock.  [It is safe to do so,
2294                                  * since it must be deeper in the
2295                                  * object tree.]
2296                                  */
2297                                 vm_object_lock(object);
2298                         }
2299
2300                         /*
2301                          * Because we're pushing a page upward
2302                          * in the object tree, we must restart
2303                          * any faults that are waiting here.
2304                          * [Note that this is an expansion of
2305                          * PAGE_WAKEUP that uses the THREAD_RESTART
2306                          * wait result].  Can't turn off the page's
2307                          * busy bit because we're not done with it.
2308                          */
2309                         if (m->vmp_wanted) {
2310                                 m->vmp_wanted = FALSE;
2311                                 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2312                         }
2313                 }
2314                 /*
2315                  * The reference count on copy_object must be
2316                  * at least 2: one for our extra reference,
2317                  * and at least one from the outside world
2318                  * (we checked that when we last locked
2319                  * copy_object).
2320                  */
2321                 vm_object_lock_assert_exclusive(copy_object);
2322                 copy_object->ref_count--;
2323                 assert(copy_object->ref_count > 0);
2324
2325                 VM_OBJ_RES_DECR(copy_object);
2326                 vm_object_unlock(copy_object);
2327
2328                 break;
2329         }
2330
2331 done:
2332         *result_page = m;
2333         *top_page = first_m;
2334
2335         if (m != VM_PAGE_NULL) {
2336                 assert(VM_PAGE_OBJECT(m) == object);
2337
2338                 retval = VM_FAULT_SUCCESS;
2339
2340                 if (my_fault == DBG_PAGEIN_FAULT) {
2341                         VM_PAGE_COUNT_AS_PAGEIN(m);
2342
2343                         if (object->internal) {
2344                                 my_fault = DBG_PAGEIND_FAULT;
2345                         } else {
2346                                 my_fault = DBG_PAGEINV_FAULT;
2347                         }
2348
2349                         /*
2350                          * evaluate access pattern and update state
2351                          * vm_fault_deactivate_behind depends on the
2352                          * state being up to date
2353                          */
2354                         vm_fault_is_sequential(object, offset, fault_info->behavior);
2355                         vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2356                 } else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) {
2357                         /*
2358                          * we weren't called from vm_fault, so handle the
2359                          * accounting here for hits in the cache
2360                          */
2361                         if (m->vmp_clustered) {
2362                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2363                                 VM_PAGE_CONSUME_CLUSTERED(m);
2364                         }
2365                         vm_fault_is_sequential(object, offset, fault_info->behavior);
2366                         vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2367                 } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2368                         VM_STAT_DECOMPRESSIONS();
2369                 }
2370                 if (type_of_fault) {
2371                         *type_of_fault = my_fault;
2372                 }
2373         } else {
2374                 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2375                 assert(first_m == VM_PAGE_NULL);
2376                 assert(object == first_object);
2377         }
2378
2379         thread_interrupt_level(interruptible_state);
2380
2381 #if TRACEFAULTPAGE
2382         dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);       /* (TEST/DEBUG) */
2383 #endif
2384         return retval;
2385
2386 backoff:
2387         thread_interrupt_level(interruptible_state);
2388
2389         if (wait_result == THREAD_INTERRUPTED) {
2390                 return VM_FAULT_INTERRUPTED;
2391         }
2392         return VM_FAULT_RETRY;
2393
2394 #undef  RELEASE_PAGE
2395 }
2396
2397
2398
2399 /*
2400  * CODE SIGNING:
2401  * When soft faulting a page, we have to validate the page if:
2402  * 1. the page is being mapped in user space
2403  * 2. the page hasn't already been found to be "tainted"
2404  * 3. the page belongs to a code-signed object
2405  * 4. the page has not been validated yet or has been mapped for write.
2406  */
2407 #define VM_FAULT_NEED_CS_VALIDATION(pmap, page, page_obj)               \
2408         (/* 1: pmap is not the kernel pmap */ (pmap) != kernel_pmap &&  \
2409          /* 2: page is not marked tainted */ !(page)->vmp_cs_tainted && \
2410          /* 3: object is code-signed */ (page_obj)->code_signed &&      \
2411          /* 4: not validated, or wpmapped */ (!(page)->vmp_cs_validated || (page)->vmp_wpmapped))
2412
2413
2414 /*
2415  * page queue lock must NOT be held
2416  * m->vmp_object must be locked
2417  *
2418  * NOTE: m->vmp_object could be locked "shared" only if we are called
2419  * from vm_fault() as part of a soft fault.  If so, we must be
2420  * careful not to modify the VM object in any way that is not
2421  * legal under a shared lock...
2422  */
2423 extern int panic_on_cs_killed;      /* declared here directly rather than through a header */
2424 extern int proc_selfpid(void);      /* declared here directly rather than through a header */
2425 extern char *proc_name_address(void *p);    /* declared here directly rather than through a header */
2426 unsigned long cs_enter_tainted_rejected = 0;        /* starts at 0; NOTE(review): presumably counts tainted pages that vm_fault_enter() refused to map -- confirm */
2427 unsigned long cs_enter_tainted_accepted = 0;        /* starts at 0; NOTE(review): presumably counts tainted pages that vm_fault_enter() mapped anyway -- confirm */
2428 kern_return_t
2429 vm_fault_enter(vm_page_t m,
2430     pmap_t pmap,
2431     vm_map_offset_t vaddr,
2432     vm_prot_t prot,
2433     vm_prot_t caller_prot,
2434     boolean_t wired,
2435     boolean_t change_wiring,
2436     vm_tag_t  wire_tag,
2437     vm_object_fault_info_t fault_info,
2438     boolean_t *need_retry,
2439     int *type_of_fault)
2440 {
2441         kern_return_t   kr, pe_result;
2442         boolean_t       previously_pmapped = m->vmp_pmapped;
2443         boolean_t       must_disconnect = 0;
2444         boolean_t       map_is_switched, map_is_switch_protected;
2445         boolean_t       cs_violation;
2446         int             cs_enforcement_enabled;
2447         vm_prot_t       fault_type;
2448         vm_object_t     object;
2449         boolean_t       no_cache = fault_info->no_cache;
2450         boolean_t       cs_bypass = fault_info->cs_bypass;
2451         int             pmap_options = fault_info->pmap_options;
2452
2453         fault_type = change_wiring ? VM_PROT_NONE : caller_prot;
2454         object = VM_PAGE_OBJECT(m);
2455
2456         vm_object_lock_assert_held(object);
2457
2458 #if KASAN
2459         if (pmap == kernel_pmap) {
2460                 kasan_notify_address(vaddr, PAGE_SIZE);
2461         }
2462 #endif
2463
2464         LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2465
2466         if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
2467                 assert(m->vmp_fictitious);
2468                 return KERN_SUCCESS;
2469         }
2470
2471         if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
2472                 vm_object_lock_assert_exclusive(object);
2473         } else if ((fault_type & VM_PROT_WRITE) == 0 &&
2474             (!m->vmp_wpmapped
2475 #if VM_OBJECT_ACCESS_TRACKING
2476             || object->access_tracking
2477 #endif /* VM_OBJECT_ACCESS_TRACKING */
2478             )) {
2479                 /*
2480                  * This is not a "write" fault, so we
2481                  * might not have taken the object lock
2482                  * exclusively and we might not be able
2483                  * to update the "wpmapped" bit in
2484                  * vm_fault_enter().
2485                  * Let's just grant read access to
2486                  * the page for now and we'll
2487                  * soft-fault again if we need write
2488                  * access later...
2489                  */
2490
2491                 /* This had better not be a JIT page. */
2492                 if (!pmap_has_prot_policy(prot)) {
2493                         prot &= ~VM_PROT_WRITE;
2494                 } else {
2495                         assert(cs_bypass);
2496                 }
2497         }
2498         if (m->vmp_pmapped == FALSE) {
2499                 if (m->vmp_clustered) {
2500                         if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
2501                                 /*
2502                                  * found it in the cache, but this
2503                                  * is the first fault-in of the page (m->vmp_pmapped == FALSE)
2504                                  * so it must have come in as part of
2505                                  * a cluster... account 1 pagein against it
2506                                  */
2507                                 if (object->internal) {
2508                                         *type_of_fault = DBG_PAGEIND_FAULT;
2509                                 } else {
2510                                         *type_of_fault = DBG_PAGEINV_FAULT;
2511                                 }
2512
2513                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2514                         }
2515                         VM_PAGE_CONSUME_CLUSTERED(m);
2516                 }
2517         }
2518
2519         if (*type_of_fault != DBG_COW_FAULT) {
2520                 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2521
2522                 if (pmap == kernel_pmap) {
2523                         DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2524                 }
2525         }
2526
2527         /* Validate code signature if necessary. */
2528         if (!cs_bypass &&
2529             VM_FAULT_NEED_CS_VALIDATION(pmap, m, object)) {
2530                 vm_object_lock_assert_exclusive(object);
2531
2532                 if (m->vmp_cs_validated) {
2533                         vm_cs_revalidates++;
2534                 }
2535
2536                 /* VM map is locked, so 1 ref will remain on VM object -
2537                  * so no harm if vm_page_validate_cs drops the object lock */
2538
2539 #if PMAP_CS
2540                 if (fault_info->pmap_cs_associated &&
2541                     pmap_cs_enforced(pmap) &&
2542                     !m->vmp_cs_validated &&
2543                     !m->vmp_cs_tainted &&
2544                     !m->vmp_cs_nx &&
2545                     (prot & VM_PROT_EXECUTE) &&
2546                     (caller_prot & VM_PROT_EXECUTE)) {
2547                         /*
2548                          * With pmap_cs, the pmap layer will validate the
2549                          * code signature for any executable pmap mapping.
2550                          * No need for us to validate this page too:
2551                          * in pmap_cs we trust...
2552                          */
2553                         vm_cs_defer_to_pmap_cs++;
2554                 } else {
2555                         vm_cs_defer_to_pmap_cs_not++;
2556                         vm_page_validate_cs(m);
2557                 }
2558 #else /* PMAP_CS */
2559                 vm_page_validate_cs(m);
2560 #endif /* PMAP_CS */
2561         }
2562
2563 #define page_immutable(m, prot) ((m)->vmp_cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/ )
2564 #define page_nx(m) ((m)->vmp_cs_nx)
2565
2566         map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2567             (pmap == vm_map_pmap(current_thread()->map)));
2568         map_is_switch_protected = current_thread()->map->switch_protect;
2569
2570         /* If the map is switched, and is switch-protected, we must protect
2571          * some pages from being write-faulted: immutable pages because by
2572          * definition they may not be written, and executable pages because that
2573          * would provide a way to inject unsigned code.
2574          * If the page is immutable, we can simply return. However, we can't
2575          * immediately determine whether a page is executable anywhere. But,
2576          * we can disconnect it everywhere and remove the executable protection
2577          * from the current map. We do that below right before we do the
2578          * PMAP_ENTER.
2579          */
2580         cs_enforcement_enabled = cs_process_enforcement(NULL);
2581
2582         if (cs_enforcement_enabled && map_is_switched &&
2583             map_is_switch_protected && page_immutable(m, prot) &&
2584             (prot & VM_PROT_WRITE)) {
2585                 return KERN_CODESIGN_ERROR;
2586         }
2587
2588         if (cs_enforcement_enabled && page_nx(m) && (prot & VM_PROT_EXECUTE)) {
2589                 if (cs_debug) {
2590                         printf("page marked to be NX, not letting it be mapped EXEC\n");
2591                 }
2592                 return KERN_CODESIGN_ERROR;
2593         }
2594
2595         /* A page could be tainted, or pose a risk of being tainted later.
2596          * Check whether the receiving process wants it, and make it feel
2597          * the consequences (that happens in cs_invalid_page()).
2598          * For CS Enforcement, two other conditions will
2599          * cause that page to be tainted as well:
2600          * - pmapping an unsigned page executable - this means unsigned code;
2601          * - writeable mapping of a validated page - the content of that page
2602          *   can be changed without the kernel noticing, therefore unsigned
2603          *   code can be created
2604          */
2605         if (cs_bypass) {
2606                 /* code-signing is bypassed */
2607                 cs_violation = FALSE;
2608         } else if (m->vmp_cs_tainted) {
2609                 /* tainted page */
2610                 cs_violation = TRUE;
2611         } else if (!cs_enforcement_enabled) {
2612                 /* no further code-signing enforcement */
2613                 cs_violation = FALSE;
2614         } else if (page_immutable(m, prot) &&
2615             ((prot & VM_PROT_WRITE) ||
2616             m->vmp_wpmapped)) {
2617                 /*
2618                  * The page should be immutable, but is in danger of being
2619                  * modified.
2620                  * This is the case where we want policy from the code
2621                  * directory - is the page immutable or not? For now we have
2622                  * to assume that code pages will be immutable, data pages not.
2623                  * We'll assume a page is a code page if it has a code directory
2624                  * and we fault for execution.
2625                  * That is good enough since if we faulted the code page for
2626                  * writing in another map before, it is wpmapped; if we fault
2627                  * it for writing in this map later it will also be faulted for
2628                  * executing at the same time; and if we fault for writing in
2629                  * another map later, we will disconnect it from this pmap so
2630                  * we'll notice the change.
2631                  */
2632                 cs_violation = TRUE;
2633         } else if (!m->vmp_cs_validated &&
2634             (prot & VM_PROT_EXECUTE)
2635 #if PMAP_CS
2636             /*
2637              * Executable pages will be validated by pmap_cs;
2638              * in pmap_cs we trust...
2639              * If pmap_cs is turned off, this is a code-signing
2640              * violation.
2641              */
2642             && !(pmap_cs_enforced(pmap))
2643 #endif /* PMAP_CS */
2644             ) {
2645                 cs_violation = TRUE;
2646         } else {
2647                 cs_violation = FALSE;
2648         }
2649
2650         if (cs_violation) {
2651                 /* We will have a tainted page. Have to handle the special case
2652                  * of a switched map now. If the map is not switched, standard
2653                  * procedure applies - call cs_invalid_page().
2654                  * If the map is switched, the real owner is invalid already.
2655                  * There is no point in invalidating the switching process since
2656                  * it will not be executing from the map. So we don't call
2657                  * cs_invalid_page() in that case. */
2658                 boolean_t reject_page, cs_killed;
2659                 if (map_is_switched) {
2660                         assert(pmap == vm_map_pmap(current_thread()->map));
2661                         assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2662                         reject_page = FALSE;
2663                 } else {
2664                         if (cs_debug > 5) {
2665                                 printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
2666                                     object->code_signed ? "yes" : "no",
2667                                     m->vmp_cs_validated ? "yes" : "no",
2668                                     m->vmp_cs_tainted ? "yes" : "no",
2669                                     m->vmp_wpmapped ? "yes" : "no",
2670                                     (int)prot);
2671                         }
2672                         reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed);
2673                 }
2674
2675                 if (reject_page) {
2676                         /* reject the invalid page: abort the page fault */
2677                         int                     pid;
2678                         const char              *procname;
2679                         task_t                  task;
2680                         vm_object_t             file_object, shadow;
2681                         vm_object_offset_t      file_offset;
2682                         char                    *pathname, *filename;
2683                         vm_size_t               pathname_len, filename_len;
2684                         boolean_t               truncated_path;
2685 #define __PATH_MAX 1024
2686                         struct timespec         mtime, cs_mtime;
2687                         int                     shadow_depth;
2688                         os_reason_t             codesigning_exit_reason = OS_REASON_NULL;
2689
2690                         kr = KERN_CODESIGN_ERROR;
2691                         cs_enter_tainted_rejected++;
2692
2693                         /* get process name and pid */
2694                         procname = "?";
2695                         task = current_task();
2696                         pid = proc_selfpid();
2697                         if (task->bsd_info != NULL) {
2698                                 procname = proc_name_address(task->bsd_info);
2699                         }
2700
2701                         /* get file's VM object */
2702                         file_object = object;
2703                         file_offset = m->vmp_offset;
2704                         for (shadow = file_object->shadow,
2705                             shadow_depth = 0;
2706                             shadow != VM_OBJECT_NULL;
2707                             shadow = file_object->shadow,
2708                             shadow_depth++) {
2709                                 vm_object_lock_shared(shadow);
2710                                 if (file_object != object) {
2711                                         vm_object_unlock(file_object);
2712                                 }
2713                                 file_offset += file_object->vo_shadow_offset;
2714                                 file_object = shadow;
2715                         }
2716
2717                         mtime.tv_sec = 0;
2718                         mtime.tv_nsec = 0;
2719                         cs_mtime.tv_sec = 0;
2720                         cs_mtime.tv_nsec = 0;
2721
2722                         /* get file's pathname and/or filename */
2723                         pathname = NULL;
2724                         filename = NULL;
2725                         pathname_len = 0;
2726                         filename_len = 0;
2727                         truncated_path = FALSE;
2728                         /* no pager -> no file -> no pathname, use "<nil>" in that case */
2729                         if (file_object->pager != NULL) {
2730                                 pathname = (char *)kalloc(__PATH_MAX * 2);
2731                                 if (pathname) {
2732                                         pathname[0] = '\0';
2733                                         pathname_len = __PATH_MAX;
2734                                         filename = pathname + pathname_len;
2735                                         filename_len = __PATH_MAX;
2736
2737                                         if (vnode_pager_get_object_name(file_object->pager,
2738                                             pathname,
2739                                             pathname_len,
2740                                             filename,
2741                                             filename_len,
2742                                             &truncated_path) == KERN_SUCCESS) {
2743                                                 /* safety first... */
2744                                                 pathname[__PATH_MAX - 1] = '\0';
2745                                                 filename[__PATH_MAX - 1] = '\0';
2746
2747                                                 vnode_pager_get_object_mtime(file_object->pager,
2748                                                     &mtime,
2749                                                     &cs_mtime);
2750                                         } else {
2751                                                 kfree(pathname, __PATH_MAX * 2);
2752                                                 pathname = NULL;
2753                                                 filename = NULL;
2754                                                 pathname_len = 0;
2755                                                 filename_len = 0;
2756                                                 truncated_path = FALSE;
2757                                         }
2758                                 }
2759                         }
2760                         printf("CODE SIGNING: process %d[%s]: "
2761                             "rejecting invalid page at address 0x%llx "
2762                             "from offset 0x%llx in file \"%s%s%s\" "
2763                             "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2764                             "(signed:%d validated:%d tainted:%d nx:%d "
2765                             "wpmapped:%d dirty:%d depth:%d)\n",
2766                             pid, procname, (addr64_t) vaddr,
2767                             file_offset,
2768                             (pathname ? pathname : "<nil>"),
2769                             (truncated_path ? "/.../" : ""),
2770                             (truncated_path ? filename : ""),
2771                             cs_mtime.tv_sec, cs_mtime.tv_nsec,
2772                             ((cs_mtime.tv_sec == mtime.tv_sec &&
2773                             cs_mtime.tv_nsec == mtime.tv_nsec)
2774                             ? "=="
2775                             : "!="),
2776                             mtime.tv_sec, mtime.tv_nsec,
2777                             object->code_signed,
2778                             m->vmp_cs_validated,
2779                             m->vmp_cs_tainted,
2780                             m->vmp_cs_nx,
2781                             m->vmp_wpmapped,
2782                             m->vmp_dirty,
2783                             shadow_depth);
2784
2785                         /*
2786                          * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
2787                          * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
2788                          * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
2789                          * will deal with the segmentation fault.
2790                          */
2791                         if (cs_killed) {
2792                                 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
2793                                     pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE, 0, 0);
2794
2795                                 codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
2796                                 if (codesigning_exit_reason == NULL) {
2797                                         printf("vm_fault_enter: failed to allocate codesigning exit reason\n");
2798                                 } else {
2799                                         mach_vm_address_t data_addr = 0;
2800                                         struct codesigning_exit_reason_info *ceri = NULL;
2801                                         uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri));
2802
2803                                         if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) {
2804                                                 printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
2805                                         } else {
2806                                                 if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor,
2807                                                     EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) {
2808                                                         ceri = (struct codesigning_exit_reason_info *)data_addr;
2809                                                         static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname));
2810
2811                                                         ceri->ceri_virt_addr = vaddr;
2812                                                         ceri->ceri_file_offset = file_offset;
2813                                                         if (pathname) {
2814                                                                 strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname));
2815                                                         } else {
2816                                                                 ceri->ceri_pathname[0] = '\0';
2817                                                         }
2818                                                         if (filename) {
2819                                                                 strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename));
2820                                                         } else {
2821                                                                 ceri->ceri_filename[0] = '\0';
2822                                                         }
2823                                                         ceri->ceri_path_truncated = (truncated_path);
2824                                                         ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec;
2825                                                         ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec;
2826                                                         ceri->ceri_page_modtime_secs = mtime.tv_sec;
2827                                                         ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
2828                                                         ceri->ceri_object_codesigned = (object->code_signed);
2829                                                         ceri->ceri_page_codesig_validated = (m->vmp_cs_validated);
2830                                                         ceri->ceri_page_codesig_tainted = (m->vmp_cs_tainted);
2831                                                         ceri->ceri_page_codesig_nx = (m->vmp_cs_nx);
2832                                                         ceri->ceri_page_wpmapped = (m->vmp_wpmapped);
2833                                                         ceri->ceri_page_slid = 0;
2834                                                         ceri->ceri_page_dirty = (m->vmp_dirty);
2835                                                         ceri->ceri_page_shadow_depth = shadow_depth;
2836                                                 } else {
2837 #if DEBUG || DEVELOPMENT
2838                                                         panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
2839 #else
2840                                                         printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
2841 #endif /* DEBUG || DEVELOPMENT */
2842                                                         /* Free the buffer */
2843                                                         os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0);
2844                                                 }
2845                                         }
2846                                 }
2847
2848                                 set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE);
2849                         }
2850                         if (panic_on_cs_killed &&
2851                             object->object_is_shared_cache) {
2852                                 char *tainted_contents;
2853                                 vm_map_offset_t src_vaddr;
2854                                 src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m) << PAGE_SHIFT);
2855                                 tainted_contents = kalloc(PAGE_SIZE);
2856                                 bcopy((const char *)src_vaddr, tainted_contents, PAGE_SIZE);
2857                                 printf("CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m, VM_PAGE_GET_PHYS_PAGE(m), (uint64_t)src_vaddr, tainted_contents);
2858                                 panic("CODE SIGNING: process %d[%s]: "
2859                                     "rejecting invalid page (phys#0x%x) at address 0x%llx "
2860                                     "from offset 0x%llx in file \"%s%s%s\" "
2861                                     "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2862                                     "(signed:%d validated:%d tainted:%d nx:%d"
2863                                     "wpmapped:%d dirty:%d depth:%d)\n",
2864                                     pid, procname,
2865                                     VM_PAGE_GET_PHYS_PAGE(m),
2866                                     (addr64_t) vaddr,
2867                                     file_offset,
2868                                     (pathname ? pathname : "<nil>"),
2869                                     (truncated_path ? "/.../" : ""),
2870                                     (truncated_path ? filename : ""),
2871                                     cs_mtime.tv_sec, cs_mtime.tv_nsec,
2872                                     ((cs_mtime.tv_sec == mtime.tv_sec &&
2873                                     cs_mtime.tv_nsec == mtime.tv_nsec)
2874                                     ? "=="
2875                                     : "!="),
2876                                     mtime.tv_sec, mtime.tv_nsec,
2877                                     object->code_signed,
2878                                     m->vmp_cs_validated,
2879                                     m->vmp_cs_tainted,
2880                                     m->vmp_cs_nx,
2881                                     m->vmp_wpmapped,
2882                                     m->vmp_dirty,
2883                                     shadow_depth);
2884                         }
2885
2886                         if (file_object != object) {
2887                                 vm_object_unlock(file_object);
2888                         }
2889                         if (pathname_len != 0) {
2890                                 kfree(pathname, __PATH_MAX * 2);
2891                                 pathname = NULL;
2892                                 filename = NULL;
2893                         }
2894                 } else {
2895                         /* proceed with the invalid page */
2896                         kr = KERN_SUCCESS;
2897                         if (!m->vmp_cs_validated &&
2898                             !object->code_signed) {
2899                                 /*
2900                                  * This page has not been (fully) validated but
2901                                  * does not belong to a code-signed object
2902                                  * so it should not be forcefully considered
2903                                  * as tainted.
2904                                  * We're just concerned about it here because
2905                                  * we've been asked to "execute" it but that
2906                                  * does not mean that it should cause other
2907                                  * accesses to fail.
2908                                  * This happens when a debugger sets a
2909                                  * breakpoint and we then execute code in
2910                                  * that page.  Marking the page as "tainted"
2911                                  * would cause any inspection tool ("leaks",
2912                                  * "vmmap", "CrashReporter", ...) to get killed
2913                                  * due to code-signing violation on that page,
2914                                  * even though they're just reading it and not
2915                                  * executing from it.
2916                                  */
2917                         } else {
2918                                 /*
2919                                  * Page might have been tainted before or not;
2920                                  * now it definitively is. If the page wasn't
2921                                  * tainted, we must disconnect it from all
2922                                  * pmaps later, to force existing mappings
2923                                  * through that code path for re-consideration
2924                                  * of the validity of that page.
2925                                  */
2926                                 must_disconnect = !m->vmp_cs_tainted;
2927                                 m->vmp_cs_tainted = TRUE;
2928                         }
2929                         cs_enter_tainted_accepted++;
2930                 }
2931                 if (kr != KERN_SUCCESS) {
2932                         if (cs_debug) {
2933                                 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2934                                     "*** INVALID PAGE ***\n",
2935                                     (long long)vaddr);
2936                         }
2937 #if !SECURE_KERNEL
2938                         if (cs_enforcement_panic) {
2939                                 panic("CODESIGNING: panicking on invalid page\n");
2940                         }
2941 #endif
2942                 }
2943         } else {
2944                 /* proceed with the valid page */
2945                 kr = KERN_SUCCESS;
2946         }
2947
2948         boolean_t       page_queues_locked = FALSE;
2949 #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED()   /* call vm_page_lockspin_queues() unless page_queues_locked says the lock is already held */ \
2950 MACRO_BEGIN                                     \
2951         if (! page_queues_locked) {             \
2952                 page_queues_locked = TRUE;      /* record ownership so a repeated use is a no-op */ \
2953                 vm_page_lockspin_queues();      \
2954         }                                       \
2955 MACRO_END
2956 #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED()     /* call vm_page_unlock_queues() only if page_queues_locked says this path took the lock */ \
2957 MACRO_BEGIN                                     \
2958         if (page_queues_locked) {               \
2959                 page_queues_locked = FALSE;     /* clear the flag so a repeated use is a no-op */ \
2960                 vm_page_unlock_queues();        \
2961         }                                       \
2962 MACRO_END
2963
2964         /*
2965          * Hold queues lock to manipulate
2966          * the page queues.  Change wiring
2967          * case is obvious.
2968          */
2969         assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object);
2970
2971 #if CONFIG_BACKGROUND_QUEUE
2972         vm_page_update_background_state(m);
2973 #endif
2974         if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
2975                 /*
2976                  * Compressor pages are neither wired
2977                  * nor pageable and should never change.
2978                  */
2979                 assert(object == compressor_object);
2980         } else if (change_wiring) {
2981                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2982
2983                 if (wired) {
2984                         if (kr == KERN_SUCCESS) {
2985                                 vm_page_wire(m, wire_tag, TRUE);
2986                         }
2987                 } else {
2988                         vm_page_unwire(m, TRUE);
2989                 }
2990                 /* we keep the page queues lock, if we need it later */
2991         } else {
2992                 if (object->internal == TRUE) {
2993                         /*
2994                          * don't allow anonymous pages on
2995                          * the speculative queues
2996                          */
2997                         no_cache = FALSE;
2998                 }
2999                 if (kr != KERN_SUCCESS) {
3000                         __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3001                         vm_page_deactivate(m);
3002                         /* we keep the page queues lock, if we need it later */
3003                 } else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
3004                     (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
3005                     (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
3006                     ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
3007                     !VM_PAGE_WIRED(m)) {
3008                         if (vm_page_local_q &&
3009                             (*type_of_fault == DBG_COW_FAULT ||
3010                             *type_of_fault == DBG_ZERO_FILL_FAULT)) {
3011                                 struct vpl      *lq;
3012                                 uint32_t        lid;
3013
3014                                 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3015
3016                                 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3017                                 vm_object_lock_assert_exclusive(object);
3018
3019                                 /*
3020                                  * we got a local queue to stuff this
3021                                  * new page on...
3022                                  * it's safe to manipulate local and
3023                                  * local_id at this point since we're
3024                                  * behind an exclusive object lock and
3025                                  * the page is not on any global queue.
3026                                  *
3027                                  * we'll use the current cpu number to
3028                                  * select the queue; note that we don't
3029                                  * need to disable preemption... we're
3030                                  * going to be behind the local queue's
3031                                  * lock to do the real work
3032                                  */
3033                                 lid = cpu_number();
3034
3035                                 lq = &vm_page_local_q[lid].vpl_un.vpl;
3036
3037                                 VPL_LOCK(&lq->vpl_lock);
3038
3039                                 vm_page_check_pageable_safe(m);
3040                                 vm_page_queue_enter(&lq->vpl_queue, m, vmp_pageq);
3041                                 m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
3042                                 m->vmp_local_id = lid;
3043                                 lq->vpl_count++;
3044
3045                                 if (object->internal) {
3046                                         lq->vpl_internal_count++;
3047                                 } else {
3048                                         lq->vpl_external_count++;
3049                                 }
3050
3051                                 VPL_UNLOCK(&lq->vpl_lock);
3052
3053                                 if (lq->vpl_count > vm_page_local_q_soft_limit) {
3054                                         /*
3055                                          * we're beyond the soft limit
3056                                          * for the local queue
3057                                          * vm_page_reactivate_local will
3058                                          * 'try' to take the global page
3059                                          * queue lock... if it can't
3060                                          * that's ok... we'll let the
3061                                          * queue continue to grow up
3062                                          * to the hard limit... at that
3063                                          * point we'll wait for the
3064                                          * lock... once we've got the
3065                                          * lock, we'll transfer all of
3066                                          * the pages from the local
3067                                          * queue to the global active
3068                                          * queue
3069                                          */
3070                                         vm_page_reactivate_local(lid, FALSE, FALSE);
3071                                 }
3072                         } else {
3073                                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3074
3075                                 /*
3076                                  * test again now that we hold the
3077                                  * page queue lock
3078                                  */
3079                                 if (!VM_PAGE_WIRED(m)) {
3080                                         if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3081                                                 vm_page_queues_remove(m, FALSE);
3082
3083                                                 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3084                                                 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1);
3085                                         }
3086
3087                                         if (!VM_PAGE_ACTIVE_OR_INACTIVE(m) ||
3088                                             no_cache) {
3089                                                 /*
3090                                                  * If this is a no_cache mapping
3091                                                  * and the page has never been
3092                                                  * mapped before or was
3093                                                  * previously a no_cache page,
3094                                                  * then we want to leave pages
3095                                                  * in the speculative state so
3096                                                  * that they can be readily
3097                                                  * recycled if free memory runs
3098                                                  * low.  Otherwise the page is
3099                                                  * activated as normal.
3100                                                  */
3101
3102                                                 if (no_cache &&
3103                                                     (!previously_pmapped ||
3104                                                     m->vmp_no_cache)) {
3105                                                         m->vmp_no_cache = TRUE;
3106
3107                                                         if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
3108                                                                 vm_page_speculate(m, FALSE);
3109                                                         }
3110                                                 } else if (!VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
3111                                                         vm_page_activate(m);
3112                                                 }
3113                                         }
3114                                 }
3115                                 /* we keep the page queues lock, if we need it later */
3116                         }
3117                 }
3118         }
3119         /* we're done with the page queues lock, if we ever took it */
3120         __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3121
3122
3123         /* If we have a KERN_SUCCESS from the previous checks, we either have
3124          * a good page, or a tainted page that has been accepted by the process.
3125          * In both cases the page will be entered into the pmap.
3126          * If the page is writeable, we need to disconnect it from other pmaps
3127          * now so those processes can take note.
3128          */
3129         if (kr == KERN_SUCCESS) {
3130                 /*
3131                  * NOTE: we may only hold the vm_object lock SHARED
3132                  * at this point, so we need the phys_page lock to
3133                  * properly serialize updating the pmapped and
3134                  * xpmapped bits
3135                  */
3136                 if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) {
3137                         ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3138
3139                         pmap_lock_phys_page(phys_page);
3140                         /*
3141                          * go ahead and take the opportunity
3142                          * to set 'pmapped' here so that we don't
3143                          * need to grab this lock a 2nd time
3144                          * just below
3145                          */
3146                         m->vmp_pmapped = TRUE;
3147
3148                         if (!m->vmp_xpmapped) {
3149                                 m->vmp_xpmapped = TRUE;
3150
3151                                 pmap_unlock_phys_page(phys_page);
3152
3153                                 if (!object->internal) {
3154                                         OSAddAtomic(1, &vm_page_xpmapped_external_count);
3155                                 }
3156
3157 #if defined(__arm__) || defined(__arm64__)
3158                                 pmap_sync_page_data_phys(phys_page);
3159 #else
3160                                 if (object->internal &&
3161                                     object->pager != NULL) {
3162                                         /*
3163                                          * This page could have been
3164                                          * uncompressed by the
3165                                          * compressor pager and its
3166                                          * contents might be only in
3167                                          * the data cache.
3168                                          * Since it's being mapped for
3169                                          * "execute" for the first time,
3170                                          * make sure the icache is in
3171                                          * sync.
3172                                          */
3173                                         assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3174                                         pmap_sync_page_data_phys(phys_page);
3175                                 }
3176 #endif
3177                         } else {
3178                                 pmap_unlock_phys_page(phys_page);
3179                         }
3180                 } else {
3181                         if (m->vmp_pmapped == FALSE) {
3182                                 ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3183
3184                                 pmap_lock_phys_page(phys_page);
3185                                 m->vmp_pmapped = TRUE;
3186                                 pmap_unlock_phys_page(phys_page);
3187                         }
3188                 }
3189
3190                 if (fault_type & VM_PROT_WRITE) {
3191                         if (m->vmp_wpmapped == FALSE) {
3192                                 vm_object_lock_assert_exclusive(object);
3193                                 if (!object->internal && object->pager) {
3194                                         task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager));
3195                                 }
3196                                 m->vmp_wpmapped = TRUE;
3197                         }
3198                         if (must_disconnect) {
3199                                 /*
3200                                  * We can only get here
3201                                  * because of the CSE logic
3202                                  */
3203                                 assert(cs_enforcement_enabled);
3204                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3205                                 /*
3206                                  * If we are faulting for a write, we can clear
3207                                  * the execute bit - that will ensure the page is
3208                                  * checked again before being executable, which
3209                                  * protects against a map switch.
3210                                  * This only happens the first time the page
3211                                  * gets tainted, so we won't get stuck here
3212                                  * to make an already writeable page executable.
3213                                  */
3214                                 if (!cs_bypass) {
3215                                         assert(!pmap_has_prot_policy(prot));
3216                                         prot &= ~VM_PROT_EXECUTE;
3217                                 }
3218                         }
3219                 }
3220                 assert(VM_PAGE_OBJECT(m) == object);
3221
3222 #if VM_OBJECT_ACCESS_TRACKING
3223                 if (object->access_tracking) {
3224                         DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type);
3225                         if (fault_type & VM_PROT_WRITE) {
3226                                 object->access_tracking_writes++;
3227                                 vm_object_access_tracking_writes++;
3228                         } else {
3229                                 object->access_tracking_reads++;
3230                                 vm_object_access_tracking_reads++;
3231                         }
3232                 }
3233 #endif /* VM_OBJECT_ACCESS_TRACKING */
3234
3235
3236 #if PMAP_CS
3237 pmap_enter_retry:
3238 #endif
3239                 /* Prevent a deadlock by not
3240                  * holding the object lock if we need to wait for a page in
3241                  * pmap_enter() - <rdar://problem/7138958> */
3242                 PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0,
3243                     wired,
3244                     pmap_options | PMAP_OPTIONS_NOWAIT,
3245                     pe_result);
3246 #if PMAP_CS
3247                 /*
3248                  * Retry without execute permission if we encountered a codesigning
3249                  * failure on a non-execute fault.  This allows applications which
3250                  * don't actually need to execute code to still map it for read access.
3251                  */
3252                 if ((pe_result == KERN_CODESIGN_ERROR) && pmap_cs_enforced(pmap) &&
3253                     (prot & VM_PROT_EXECUTE) && !(caller_prot & VM_PROT_EXECUTE)) {
3254                         prot &= ~VM_PROT_EXECUTE;
3255                         goto pmap_enter_retry;
3256                 }
3257 #endif
3258 #if __x86_64__
3259                 if (pe_result == KERN_INVALID_ARGUMENT &&
3260                     pmap == PMAP_NULL &&
3261                     wired) {
3262                         /*
3263                          * Wiring a page in a pmap-less VM map:
3264                          * VMware's "vmmon" kernel extension does this
3265                          * to grab pages.
3266                          * Let it proceed even though the PMAP_ENTER() failed.
3267                          */
3268                         pe_result = KERN_SUCCESS;
3269                 }
3270 #endif /* __x86_64__ */
3271
3272                 if (pe_result == KERN_RESOURCE_SHORTAGE) {
3273                         if (need_retry) {
3274                                 /*
3275                                  * this will be non-null in the case where we hold the lock
3276                                  * on the top-object in this chain... we can't just drop
3277                                  * the lock on the object we're inserting the page into
3278                                  * and recall the PMAP_ENTER since we can still cause
3279                                  * a deadlock if one of the critical paths tries to
3280                                  * acquire the lock on the top-object and we're blocked
3281                                  * in PMAP_ENTER waiting for memory... our only recourse
3282                                  * is to deal with it at a higher level where we can
3283                                  * drop both locks.
3284                                  */
3285                                 *need_retry = TRUE;
3286                                 vm_pmap_enter_retried++;
3287                                 goto after_the_pmap_enter;
3288                         }
3289                         /* The nonblocking version of pmap_enter did not succeed.
3290                          * and we don't need to drop other locks and retry
3291                          * at the level above us, so
3292                          * use the blocking version instead. Requires marking
3293                          * the page busy and unlocking the object */
3294                         boolean_t was_busy = m->vmp_busy;
3295
3296                         vm_object_lock_assert_exclusive(object);
3297
3298                         m->vmp_busy = TRUE;
3299                         vm_object_unlock(object);
3300
3301                         PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type,
3302                             0, wired,
3303                             pmap_options, pe_result);
3304
3305                         assert(VM_PAGE_OBJECT(m) == object);
3306
3307                         /* Take the object lock again. */
3308                         vm_object_lock(object);
3309
3310                         /* If the page was busy, someone else will wake it up.
3311                          * Otherwise, we have to do it now. */
3312                         assert(m->vmp_busy);
3313                         if (!was_busy) {
3314                                 PAGE_WAKEUP_DONE(m);
3315                         }
3316                         vm_pmap_enter_blocked++;
3317                 }
3318
3319                 kr = pe_result;
3320         }
3321
3322 after_the_pmap_enter:
3323         return kr;
3324 }
3325
3326 void
3327 vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot)
3328 {
3329         /* Pre-fault vaddr in the current map; skipped unless pmap_find_phys() reports 0. */
3330         if (pmap_find_phys(current_map()->pmap, vaddr) != 0) {
3331                 return;         /* pmap_find_phys() returned nonzero for vaddr */
3332         }
3333         /*
3334          * After prot: change_wiring = FALSE, tag = VM_KERN_MEMORY_NONE (not wiring),
3335          * THREAD_UNINT, caller_pmap = NULL, caller_pmap_addr = 0; result is not examined.
3336          */
3337         vm_fault(current_map(), vaddr, prot, FALSE, VM_KERN_MEMORY_NONE,
3338             THREAD_UNINT, NULL, 0);
3339 }
3340
3341
3342 /*
3343  *      Routine:        vm_fault
3344  *      Purpose:
3345  *              Handle page faults, including pseudo-faults
3346  *              used to change the wiring status of pages.
3347  *      Returns:
3348  *              Explicit continuations have been removed.
3349  *      Implementation:
3350  *              vm_fault and vm_fault_page save mucho state
3351  *              in the moral equivalent of a closure.  The state
3352  *              structure is allocated when first entering vm_fault
3353  *              and deallocated when leaving vm_fault.
3354  */
3355
3356 extern int _map_enter_debug;                     /* declaration only; definition is not in this chunk */
3357 extern uint64_t get_current_unique_pid(void);    /* declaration only; definition is not in this chunk */
3358
3359 unsigned long vm_fault_collapse_total = 0;       /* NOTE(review): presumably counts collapse attempts; updates not visible here - confirm */
3360 unsigned long vm_fault_collapse_skipped = 0;     /* NOTE(review): presumably counts skipped collapses; updates not visible here - confirm */
3361
3362
3363 kern_return_t           /* vm_fault_internal()'s result, returned unchanged */
3364 vm_fault_external(      /* vm_fault_internal() wrapper that takes no tag parameter */
3365         vm_map_t        map,                    /* map the address is looked up in */
3366         vm_map_offset_t vaddr,                  /* faulting address (page-truncated downstream) */
3367         vm_prot_t       fault_type,             /* requested access; caller_prot downstream */
3368         boolean_t       change_wiring,          /* TRUE: downstream uses VM_PROT_NONE as fault type */
3369         int             interruptible,          /* given to thread_interrupt_level() downstream */
3370         pmap_t          caller_pmap,            /* forwarded unchanged */
3371         vm_map_offset_t caller_pmap_addr)       /* forwarded unchanged */
3372 {
3373         vm_tag_t bt_tag = vm_tag_bt();  /* tag comes from vm_tag_bt(), not from the caller */
3374         return vm_fault_internal(map, vaddr, fault_type, change_wiring, bt_tag,
3375                    interruptible, caller_pmap, caller_pmap_addr, NULL /* physpage_p */);
3376 }
3377
3378 kern_return_t           /* vm_fault_internal()'s result, returned unchanged */
3379 vm_fault(               /* vm_fault_internal() wrapper with a caller-supplied tag */
3380         vm_map_t        map,                    /* map the address is looked up in */
3381         vm_map_offset_t vaddr,                  /* faulting address (page-truncated downstream) */
3382         vm_prot_t       fault_type,             /* requested access; caller_prot downstream */
3383         boolean_t       change_wiring,          /* TRUE: downstream uses VM_PROT_NONE as fault type */
3384         vm_tag_t        wire_tag,               /* when wiring, must not be VM_KERN_MEMORY_NONE */
3385         int             interruptible,          /* given to thread_interrupt_level() downstream */
3386         pmap_t          caller_pmap,            /* forwarded unchanged */
3387         vm_map_offset_t caller_pmap_addr)       /* forwarded unchanged */
3388 {
3389         ppnum_t *physpage_p = NULL;     /* vm_fault() supplies no physpage_p to the internal routine */
3390         return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag,
3391                    interruptible, caller_pmap, caller_pmap_addr, physpage_p);
3392 }
3393
3394 static boolean_t        /* csproc_get_platform_binary()'s answer for current_proc() */
3395 current_proc_is_privileged(void)
3396 {
3397         return (boolean_t)csproc_get_platform_binary(current_proc());
3398 }
3399
3400 uint64_t vm_copied_on_read = 0; /* NOTE(review): presumably a copy-on-read event counter; updates not visible in this chunk - confirm */
3401
3402 kern_return_t
3403 vm_fault_internal(
3404         vm_map_t        map,
3405         vm_map_offset_t vaddr,
3406         vm_prot_t       caller_prot,
3407         boolean_t       change_wiring,
3408         vm_tag_t        wire_tag,               /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3409         int             interruptible,
3410         pmap_t          caller_pmap,
3411         vm_map_offset_t caller_pmap_addr,
3412         ppnum_t         *physpage_p)
3413 {
3414         vm_map_version_t        version;        /* Map version for verificiation */
3415         boolean_t               wired;          /* Should mapping be wired down? */
3416         vm_object_t             object;         /* Top-level object */
3417         vm_object_offset_t      offset;         /* Top-level offset */
3418         vm_prot_t               prot;           /* Protection for mapping */
3419         vm_object_t             old_copy_object; /* Saved copy object */
3420         vm_page_t               result_page;    /* Result of vm_fault_page */
3421         vm_page_t               top_page;       /* Placeholder page */
3422         kern_return_t           kr;
3423
3424         vm_page_t               m;      /* Fast access to result_page */
3425         kern_return_t           error_code;
3426         vm_object_t             cur_object;
3427         vm_object_t             m_object = NULL;
3428         vm_object_offset_t      cur_offset;
3429         vm_page_t               cur_m;
3430         vm_object_t             new_object;
3431         int                     type_of_fault;
3432         pmap_t                  pmap;
3433         wait_interrupt_t        interruptible_state;
3434         vm_map_t                real_map = map;
3435         vm_map_t                original_map = map;
3436         boolean_t               object_locks_dropped = FALSE;
3437         vm_prot_t               fault_type;
3438         vm_prot_t               original_fault_type;
3439         struct vm_object_fault_info fault_info = {};
3440         boolean_t               need_collapse = FALSE;
3441         boolean_t               need_retry = FALSE;
3442         boolean_t               *need_retry_ptr = NULL;
3443         int                     object_lock_type = 0;
3444         int                     cur_object_lock_type;
3445         vm_object_t             top_object = VM_OBJECT_NULL;
3446         vm_object_t             written_on_object = VM_OBJECT_NULL;
3447         memory_object_t         written_on_pager = NULL;
3448         vm_object_offset_t      written_on_offset = 0;
3449         int                     throttle_delay;
3450         int                     compressed_count_delta;
3451         int                     grab_options;
3452         boolean_t               need_copy;
3453         boolean_t               need_copy_on_read;
3454         vm_map_offset_t         trace_vaddr;
3455         vm_map_offset_t         trace_real_vaddr;
3456         vm_map_offset_t         real_vaddr;
3457         boolean_t               resilient_media_retry = FALSE;
3458         vm_object_t             resilient_media_object = VM_OBJECT_NULL;
3459         vm_object_offset_t      resilient_media_offset = (vm_object_offset_t)-1;
3460
3461         real_vaddr = vaddr;
3462         trace_real_vaddr = vaddr;
3463         vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
3464
3465         if (map == kernel_map) {
3466                 trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
3467                 trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
3468         } else {
3469                 trace_vaddr = vaddr;
3470         }
3471
3472         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3473             (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
3474             ((uint64_t)trace_vaddr >> 32),
3475             trace_vaddr,
3476             (map == kernel_map),
3477             0,
3478             0);
3479
3480         if (get_preemption_level() != 0) {
3481                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3482                     (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3483                     ((uint64_t)trace_vaddr >> 32),
3484                     trace_vaddr,
3485                     KERN_FAILURE,
3486                     0,
3487                     0);
3488
3489                 return KERN_FAILURE;
3490         }
3491
3492         thread_t cthread = current_thread();
3493         boolean_t rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
3494         uint64_t fstart = 0;
3495
3496         if (rtfault) {
3497                 fstart = mach_continuous_time();
3498         }
3499
3500         interruptible_state = thread_interrupt_level(interruptible);
3501
3502         fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
3503
3504         VM_STAT_INCR(faults);
3505         current_task()->faults++;
3506         original_fault_type = fault_type;
3507
3508         need_copy = FALSE;
3509         if (fault_type & VM_PROT_WRITE) {
3510                 need_copy = TRUE;
3511         }
3512
3513         if (need_copy) {
3514                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3515         } else {
3516                 object_lock_type = OBJECT_LOCK_SHARED;
3517         }
3518
3519         cur_object_lock_type = OBJECT_LOCK_SHARED;
3520
3521         if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
3522                 if (compressor_map) {
3523                         if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
3524                                 panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map));
3525                         }
3526                 }
3527         }
3528 RetryFault:
3529         assert(written_on_object == VM_OBJECT_NULL);
3530
3531         /*
3532          * assume we will hit a page in the cache
3533          * otherwise, explicitly override with
3534          * the real fault type once we determine it
3535          */
3536         type_of_fault = DBG_CACHE_HIT_FAULT;
3537
3538         /*
3539          *      Find the backing store object and offset into
3540          *      it to begin the search.
3541          */
3542         fault_type = original_fault_type;
3543         map = original_map;
3544         vm_map_lock_read(map);
3545
3546         if (resilient_media_retry) {
3547                 /*
3548                  * If we have to insert a fake zero-filled page to hide
3549                  * a media failure to provide the real page, we need to
3550                  * resolve any pending copy-on-write on this mapping.
3551                  * VM_PROT_COPY tells vm_map_lookup_locked() to deal
3552                  * with that even if this is not a "write" fault.
3553                  */
3554                 need_copy = TRUE;
3555                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3556         }
3557
3558         kr = vm_map_lookup_locked(&map, vaddr,
3559             (fault_type | (need_copy ? VM_PROT_COPY : 0)),
3560             object_lock_type, &version,
3561             &object, &offset, &prot, &wired,
3562             &fault_info,
3563             &real_map);
3564
3565         if (kr != KERN_SUCCESS) {
3566                 vm_map_unlock_read(map);
3567                 goto done;
3568         }
3569         pmap = real_map->pmap;
3570         fault_info.interruptible = interruptible;
3571         fault_info.stealth = FALSE;
3572         fault_info.io_sync = FALSE;
3573         fault_info.mark_zf_absent = FALSE;
3574         fault_info.batch_pmap_op = FALSE;
3575
3576         if (resilient_media_retry) {
3577                 /*
3578                  * We're retrying this fault after having detected a media
3579                  * failure from a "resilient_media" mapping.
3580                  * Check that the mapping is still pointing at the object
3581                  * that just failed to provide a page.
3582                  */
3583                 assert(resilient_media_object != VM_OBJECT_NULL);
3584                 assert(resilient_media_offset != (vm_object_offset_t)-1);
3585                 if (object != VM_OBJECT_NULL &&
3586                     object == resilient_media_object &&
3587                     offset == resilient_media_offset &&
3588                     fault_info.resilient_media) {
3589                         /*
3590                          * This mapping still points at the same object
3591                          * and is still "resilient_media": proceed in
3592                          * "recovery-from-media-failure" mode, where we'll
3593                          * insert a zero-filled page in the top object.
3594                          */
3595 //                     printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset);
3596                 } else {
3597                         /* not recovering: reset state */
3598 //                     printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info.resilient_media, object, resilient_media_object, offset, resilient_media_offset);
3599                         resilient_media_retry = FALSE;
3600                         /* release our extra reference on failed object */
3601 //                     printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
3602                         vm_object_deallocate(resilient_media_object);
3603                         resilient_media_object = VM_OBJECT_NULL;
3604                         resilient_media_offset = (vm_object_offset_t)-1;
3605                 }
3606         } else {
3607                 assert(resilient_media_object == VM_OBJECT_NULL);
3608                 resilient_media_offset = (vm_object_offset_t)-1;
3609         }
3610
3611         /*
3612          * If the page is wired, we must fault for the current protection
3613          * value, to avoid further faults.
3614          */
3615         if (wired) {
3616                 fault_type = prot | VM_PROT_WRITE;
3617         }
3618         if (wired || need_copy) {
3619                 /*
3620                  * since we're treating this fault as a 'write'
3621                  * we must hold the top object lock exclusively
3622                  */
3623                 if (object_lock_type == OBJECT_LOCK_SHARED) {
3624                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3625
3626                         if (vm_object_lock_upgrade(object) == FALSE) {
3627                                 /*
3628                                  * couldn't upgrade, so explicitly
3629                                  * take the lock exclusively
3630                                  */
3631                                 vm_object_lock(object);
3632                         }
3633                 }
3634         }
3635
3636 #if     VM_FAULT_CLASSIFY
3637         /*
3638          *      Temporary data gathering code
3639          */
3640         vm_fault_classify(object, offset, fault_type);
3641 #endif
3642         /*
3643          *      Fast fault code.  The basic idea is to do as much as
3644          *      possible while holding the map lock and object locks.
3645          *      Busy pages are not used until the object lock has to
3646          *      be dropped to do something (copy, zero fill, pmap enter).
3647          *      Similarly, paging references aren't acquired until that
3648          *      point, and object references aren't used.
3649          *
3650          *      If we can figure out what to do
3651          *      (zero fill, copy on write, pmap enter) while holding
3652          *      the locks, then it gets done.  Otherwise, we give up,
3653          *      and use the original fault path (which doesn't hold
3654          *      the map lock, and relies on busy pages).
3655          *      The give up cases include:
3656          *              - Have to talk to pager.
3657          *              - Page is busy, absent or in error.
3658          *              - Pager has locked out desired access.
3659          *              - Fault needs to be restarted.
3660          *              - Have to push page into copy object.
3661          *
3662          *      The code is an infinite loop that moves one level down
3663          *      the shadow chain each time.  cur_object and cur_offset
3664          *      refer to the current object being examined. object and offset
3665          *      are the original object from the map.  The loop is at the
3666          *      top level if and only if object and cur_object are the same.
3667          *
3668          *      Invariants:  Map lock is held throughout.  Lock is held on
3669          *              original object and cur_object (if different) when
3670          *              continuing or exiting loop.
3671          *
3672          */
3673
3674 #if defined(__arm64__)
3675         /*
3676          * Fail if reading an execute-only page in a
3677          * pmap that enforces execute-only protection.
3678          */
3679         if (fault_type == VM_PROT_READ &&
3680             (prot & VM_PROT_EXECUTE) &&
3681             !(prot & VM_PROT_READ) &&
3682             pmap_enforces_execute_only(pmap)) {
3683                 vm_object_unlock(object);
3684                 vm_map_unlock_read(map);
3685                 if (real_map != map) {
3686                         vm_map_unlock(real_map);
3687                 }
3688                 kr = KERN_PROTECTION_FAILURE;
3689                 goto done;
3690         }
3691 #endif
3692
3693         /*
3694          * If this page is to be inserted in a copy delay object
3695          * for writing, and if the object has a copy, then the
3696          * copy delay strategy is implemented in the slow fault page.
3697          */
3698         if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
3699             object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) {
3700                 goto handle_copy_delay;
3701         }
3702
3703         cur_object = object;
3704         cur_offset = offset;
3705
3706         grab_options = 0;
3707 #if CONFIG_SECLUDED_MEMORY
3708         if (object->can_grab_secluded) {
3709                 grab_options |= VM_PAGE_GRAB_SECLUDED;
3710         }
3711 #endif /* CONFIG_SECLUDED_MEMORY */
3712
3713         while (TRUE) {
3714                 if (!cur_object->pager_created &&
3715                     cur_object->phys_contiguous) { /* superpage */
3716                         break;
3717                 }
3718
3719                 if (cur_object->blocked_access) {
3720                         /*
3721                          * Access to this VM object has been blocked.
3722                          * Let the slow path handle it.
3723                          */
3724                         break;
3725                 }
3726
3727                 m = vm_page_lookup(cur_object, cur_offset);
3728                 m_object = NULL;
3729
3730                 if (m != VM_PAGE_NULL) {
3731                         m_object = cur_object;
3732
3733                         if (m->vmp_busy) {
3734                                 wait_result_t   result;
3735
3736                                 /*
3737                                  * in order to do the PAGE_ASSERT_WAIT, we must
3738                                  * have object that 'm' belongs to locked exclusively
3739                                  */
3740                                 if (object != cur_object) {
3741                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3742                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3743
3744                                                 if (vm_object_lock_upgrade(cur_object) == FALSE) {
3745                                                         /*
3746                                                          * couldn't upgrade so go do a full retry
3747                                                          * immediately since we can no longer be
3748                                                          * certain about cur_object (since we
3749                                                          * don't hold a reference on it)...
3750                                                          * first drop the top object lock
3751                                                          */
3752                                                         vm_object_unlock(object);
3753
3754                                                         vm_map_unlock_read(map);
3755                                                         if (real_map != map) {
3756                                                                 vm_map_unlock(real_map);
3757                                                         }
3758
3759                                                         goto RetryFault;
3760                                                 }
3761                                         }
3762                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3763                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3764
3765                                         if (vm_object_lock_upgrade(object) == FALSE) {
3766                                                 /*
3767                                                  * couldn't upgrade, so explicitly take the lock
3768                                                  * exclusively and go relookup the page since we
3769                                                  * will have dropped the object lock and
3770                                                  * a different thread could have inserted
3771                                                  * a page at this offset
3772                                                  * no need for a full retry since we're
3773                                                  * at the top level of the object chain
3774                                                  */
3775                                                 vm_object_lock(object);
3776
3777                                                 continue;
3778                                         }
3779                                 }
3780                                 if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
3781                                         /*
3782                                          * m->vmp_busy == TRUE and the object is locked exclusively
3783                                          * if m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q after we acquire the
3784                                          * queues lock, we are guaranteed that it is stable on
3785                                          * the pageout queue and therefore reclaimable
3786                                          *
3787                                          * NOTE: this is only true for the internal pageout queue
3788                                          * in the compressor world
3789                                          */
3790                                         assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3791
3792                                         vm_page_lock_queues();
3793
3794                                         if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
3795                                                 vm_pageout_throttle_up(m);
3796                                                 vm_page_unlock_queues();
3797
3798                                                 PAGE_WAKEUP_DONE(m);
3799                                                 goto reclaimed_from_pageout;
3800                                         }
3801                                         vm_page_unlock_queues();
3802                                 }
3803                                 if (object != cur_object) {
3804                                         vm_object_unlock(object);
3805                                 }
3806
3807                                 vm_map_unlock_read(map);
3808                                 if (real_map != map) {
3809                                         vm_map_unlock(real_map);
3810                                 }
3811
3812                                 result = PAGE_ASSERT_WAIT(m, interruptible);
3813
3814                                 vm_object_unlock(cur_object);
3815
3816                                 if (result == THREAD_WAITING) {
3817                                         result = thread_block(THREAD_CONTINUE_NULL);
3818
3819                                         counter(c_vm_fault_page_block_busy_kernel++);
3820                                 }
3821                                 if (result == THREAD_AWAKENED || result == THREAD_RESTART) {
3822                                         goto RetryFault;
3823                                 }
3824
3825                                 kr = KERN_ABORTED;
3826                                 goto done;
3827                         }
3828 reclaimed_from_pageout:
3829                         if (m->vmp_laundry) {
3830                                 if (object != cur_object) {
3831                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3832                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3833
3834                                                 vm_object_unlock(object);
3835                                                 vm_object_unlock(cur_object);
3836
3837                                                 vm_map_unlock_read(map);
3838                                                 if (real_map != map) {
3839                                                         vm_map_unlock(real_map);
3840                                                 }
3841
3842                                                 goto RetryFault;
3843                                         }
3844                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3845                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3846
3847                                         if (vm_object_lock_upgrade(object) == FALSE) {
3848                                                 /*
3849                                                  * couldn't upgrade, so explicitly take the lock
3850                                                  * exclusively and go relookup the page since we
3851                                                  * will have dropped the object lock and
3852                                                  * a different thread could have inserted
3853                                                  * a page at this offset
3854                                                  * no need for a full retry since we're
3855                                                  * at the top level of the object chain
3856                                                  */
3857                                                 vm_object_lock(object);
3858
3859                                                 continue;
3860                                         }
3861                                 }
3862                                 vm_pageout_steal_laundry(m, FALSE);
3863                         }
3864
3865                         if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
3866                                 /*
3867                                  * Guard page: let the slow path deal with it
3868                                  */
3869                                 break;
3870                         }
3871                         if (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent)) {
3872                                 /*
3873                                  * Unusual case... let the slow path deal with it
3874                                  */
3875                                 break;
3876                         }
3877                         if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
3878                                 if (object != cur_object) {
3879                                         vm_object_unlock(object);
3880                                 }
3881                                 vm_map_unlock_read(map);
3882                                 if (real_map != map) {
3883                                         vm_map_unlock(real_map);
3884                                 }
3885                                 vm_object_unlock(cur_object);
3886                                 kr = KERN_MEMORY_ERROR;
3887                                 goto done;
3888                         }
3889                         assert(m_object == VM_PAGE_OBJECT(m));
3890
3891                         if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m, m_object) ||
3892                             (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
3893 upgrade_lock_and_retry:
3894                                 /*
3895                                  * We might need to validate this page
3896                                  * against its code signature, so we
3897                                  * want to hold the VM object exclusively.
3898                                  */
3899                                 if (object != cur_object) {
3900                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3901                                                 vm_object_unlock(object);
3902                                                 vm_object_unlock(cur_object);
3903
3904                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3905
3906                                                 vm_map_unlock_read(map);
3907                                                 if (real_map != map) {
3908                                                         vm_map_unlock(real_map);
3909                                                 }
3910
3911                                                 goto RetryFault;
3912                                         }
3913                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3914                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3915
3916                                         if (vm_object_lock_upgrade(object) == FALSE) {
3917                                                 /*
3918                                                  * couldn't upgrade, so explicitly take the lock
3919                                                  * exclusively and go relookup the page since we
3920                                                  * will have dropped the object lock and
3921                                                  * a different thread could have inserted
3922                                                  * a page at this offset
3923                                                  * no need for a full retry since we're
3924                                                  * at the top level of the object chain
3925                                                  */
3926                                                 vm_object_lock(object);
3927
3928                                                 continue;
3929                                         }
3930                                 }
3931                         }
3932                         /*
3933                          *      Two cases of map in faults:
3934                          *          - At top level w/o copy object.
3935                          *          - Read fault anywhere.
3936                          *              --> must disallow write.
3937                          */
3938
3939                         if (object == cur_object && object->copy == VM_OBJECT_NULL) {
3940                                 goto FastPmapEnter;
3941                         }
3942
3943                         if (!need_copy &&
3944                             !fault_info.no_copy_on_read &&
3945                             cur_object != object &&
3946                             !cur_object->internal &&
3947                             !cur_object->pager_trusted &&
3948                             vm_protect_privileged_from_untrusted &&
3949                             !((prot & VM_PROT_EXECUTE) &&
3950                             cur_object->code_signed &&
3951                             cs_process_enforcement(NULL)) &&
3952                             current_proc_is_privileged()) {
3953                                 /*
3954                                  * We're faulting on a page in "object" and
3955                                  * went down the shadow chain to "cur_object"
3956                                  * to find out that "cur_object"'s pager
3957                                  * is not "trusted", i.e. we can not trust it
3958                                  * to always return the same contents.
3959                                  * Since the target is a "privileged" process,
3960                                  * let's treat this as a copy-on-read fault, as
3961                                  * if it was a copy-on-write fault.
3962                                  * Once "object" gets a copy of this page, it
3963                                  * won't have to rely on "cur_object" to
3964                                  * provide the contents again.
3965                                  *
3966                                  * This is done by setting "need_copy" and
3967                                  * retrying the fault from the top with the
3968                                  * appropriate locking.
3969                                  *
3970                                  * Special case: if the mapping is executable
3971                                  * and the untrusted object is code-signed and
3972                                  * the process is "cs_enforced", we do not
3973                                  * copy-on-read because that would break
3974                                  * code-signing enforcement expectations (an
3975                                  * executable page must belong to a code-signed
3976                                  * object) and we can rely on code-signing
3977                                  * to re-validate the page if it gets evicted
3978                                  * and paged back in.
3979                                  */
3980 //                              printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset);
3981                                 vm_copied_on_read++;
3982                                 need_copy = TRUE;
3983
3984                                 vm_object_unlock(object);
3985                                 vm_object_unlock(cur_object);
3986                                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3987                                 vm_map_unlock_read(map);
3988                                 if (real_map != map) {
3989                                         vm_map_unlock(real_map);
3990                                 }
3991                                 goto RetryFault;
3992                         }
3993
3994                         if (!(fault_type & VM_PROT_WRITE) && !need_copy) {
3995                                 if (!pmap_has_prot_policy(prot)) {
3996                                         prot &= ~VM_PROT_WRITE;
3997                                 } else {
3998                                         /*
3999                                          * For a protection that the pmap cares
4000                                          * about, we must hand over the full
4001                                          * set of protections (so that the pmap
4002                                          * layer can apply any desired policy).
4003                                          * This means that cs_bypass must be
4004                                          * set, as this can force us to pass
4005                                          * RWX.
4006                                          */
4007                                         assert(fault_info.cs_bypass);
4008                                 }
4009
4010                                 if (object != cur_object) {
4011                                         /*
4012                                          * We still need to hold the top object
4013                                          * lock here to prevent a race between
4014                                          * a read fault (taking only "shared"
4015                                          * locks) and a write fault (taking
4016                                          * an "exclusive" lock on the top
4017                                          * object).
4018                                          * Otherwise, as soon as we release the
4019                                          * top lock, the write fault could
4020                                          * proceed and actually complete before
4021                                          * the read fault, and the copied page's
4022                                          * translation could then be overwritten
4023                                          * by the read fault's translation for
4024                                          * the original page.
4025                                          *
4026                                          * Let's just record what the top object
4027                                          * is and we'll release it later.
4028                                          */
4029                                         top_object = object;
4030
4031                                         /*
4032                                          * switch to the object that has the new page
4033                                          */
4034                                         object = cur_object;
4035                                         object_lock_type = cur_object_lock_type;
4036                                 }
4037 FastPmapEnter:
4038                                 assert(m_object == VM_PAGE_OBJECT(m));
4039
4040                                 /*
4041                                  * prepare for the pmap_enter...
4042                                  * object and map are both locked
4043                                  * m contains valid data
4044                                  * object == m->vmp_object
4045                                  * cur_object == NULL or it's been unlocked
4046                                  * no paging references on either object or cur_object
4047                                  */
4048                                 if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
4049                                         need_retry_ptr = &need_retry;
4050                                 } else {
4051                                         need_retry_ptr = NULL;
4052                                 }
4053
4054                                 if (caller_pmap) {
4055                                         kr = vm_fault_enter(m,
4056                                             caller_pmap,
4057                                             caller_pmap_addr,
4058                                             prot,
4059                                             caller_prot,
4060                                             wired,
4061                                             change_wiring,
4062                                             wire_tag,
4063                                             &fault_info,
4064                                             need_retry_ptr,
4065                                             &type_of_fault);
4066                                 } else {
4067                                         kr = vm_fault_enter(m,
4068                                             pmap,
4069                                             vaddr,
4070                                             prot,
4071                                             caller_prot,
4072                                             wired,
4073                                             change_wiring,
4074                                             wire_tag,
4075                                             &fault_info,
4076                                             need_retry_ptr,
4077                                             &type_of_fault);
4078                                 }
4079                                 {
4080                                         int     event_code = 0;
4081
4082                                         if (m_object->internal) {
4083                                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
4084                                         } else if (m_object->object_is_shared_cache) {
4085                                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
4086                                         } else {
4087                                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
4088                                         }
4089
4090                                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0);
4091                                         if (need_retry == FALSE) {
4092                                                 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_FAST), get_current_unique_pid(), 0, 0, 0, 0);
4093                                         }
4094                                         DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
4095                                 }
4096                                 if (kr == KERN_SUCCESS &&
4097                                     physpage_p != NULL) {
4098                                         /* for vm_map_wire_and_extract() */
4099                                         *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
4100                                         if (prot & VM_PROT_WRITE) {
4101                                                 vm_object_lock_assert_exclusive(m_object);
4102                                                 m->vmp_dirty = TRUE;
4103                                         }
4104                                 }
4105
4106                                 if (top_object != VM_OBJECT_NULL) {
4107                                         /*
4108                                          * It's safe to drop the top object
4109                                          * now that we've done our
4110                                          * vm_fault_enter().  Any other fault
4111                                          * in progress for that virtual
4112                                          * address will either find our page
4113                                          * and translation or put in a new page
4114                                          * and translation.
4115                                          */
4116                                         vm_object_unlock(top_object);
4117                                         top_object = VM_OBJECT_NULL;
4118                                 }
4119
4120                                 if (need_collapse == TRUE) {
4121                                         vm_object_collapse(object, offset, TRUE);
4122                                 }
4123
4124                                 if (need_retry == FALSE &&
4125                                     (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
4126                                         /*
4127                                          * evaluate access pattern and update state
4128                                          * vm_fault_deactivate_behind depends on the
4129                                          * state being up to date
4130                                          */
4131                                         vm_fault_is_sequential(m_object, cur_offset, fault_info.behavior);
4132
4133                                         vm_fault_deactivate_behind(m_object, cur_offset, fault_info.behavior);
4134                                 }
4135                                 /*
4136                                  * That's it, clean up and return.
4137                                  */
4138                                 if (m->vmp_busy) {
4139                                         PAGE_WAKEUP_DONE(m);
4140                                 }
4141
4142                                 if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
4143                                         vm_object_paging_begin(m_object);
4144
4145                                         assert(written_on_object == VM_OBJECT_NULL);
4146                                         written_on_object = m_object;
4147                                         written_on_pager = m_object->pager;
4148                                         written_on_offset = m_object->paging_offset + m->vmp_offset;
4149                                 }
4150                                 vm_object_unlock(object);
4151
4152                                 vm_map_unlock_read(map);
4153                                 if (real_map != map) {
4154                                         vm_map_unlock(real_map);
4155                                 }
4156
4157                                 if (need_retry == TRUE) {
4158                                         /*
4159                                          * vm_fault_enter couldn't complete the PMAP_ENTER...
4160                                          * at this point we don't hold any locks so it's safe
4161                                          * to ask the pmap layer to expand the page table to
4162                                          * accommodate this mapping... once expanded, we'll
4163                                          * re-drive the fault which should result in vm_fault_enter
4164                                          * being able to successfully enter the mapping this time around
4165                                          */
4166                                         (void)pmap_enter_options(
4167                                                 pmap, vaddr, 0, 0, 0, 0, 0,
4168                                                 PMAP_OPTIONS_NOENTER, NULL);
4169
4170                                         need_retry = FALSE;
4171                                         goto RetryFault;
4172                                 }
4173                                 goto done;
4174                         }
4175                         /*
4176                          * COPY ON WRITE FAULT
4177                          */
4178                         assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
4179
4180                         /*
4181                          * If objects match, then
4182                          * object->copy must not be NULL (else control
4183                          * would be in previous code block), and we
4184                          * have a potential push into the copy object
4185                          * which we can't cope with here.
4186                          */
4187                         if (cur_object == object) {
4188                                 /*
4189                                  * must take the slow path to
4190                                  * deal with the copy push
4191                                  */
4192                                 break;
4193                         }
4194
4195                         /*
4196                          * This is now a shadow based copy on write
4197                          * fault -- it requires a copy up the shadow
4198                          * chain.
4199                          */
4200                         assert(m_object == VM_PAGE_OBJECT(m));
4201
4202                         if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
4203                             VM_FAULT_NEED_CS_VALIDATION(NULL, m, m_object)) {
4204                                 goto upgrade_lock_and_retry;
4205                         }
4206
4207                         /*
4208                          * Allocate a page in the original top level
4209                          * object. Give up if allocate fails.  Also
4210                          * need to remember current page, as it's the
4211                          * source of the copy.
4212                          *
4213                          * at this point we hold locks on both
4214                          * object and cur_object... no need to take
4215                          * paging refs or mark pages BUSY since
4216                          * we don't drop either object lock until
4217                          * the page has been copied and inserted
4218                          */
4219                         cur_m = m;
4220                         m = vm_page_grab_options(grab_options);
4221                         m_object = NULL;
4222
4223                         if (m == VM_PAGE_NULL) {
4224                                 /*
4225                                  * no free page currently available...
4226                                  * must take the slow path
4227                                  */
4228                                 break;
4229                         }
4230                         /*
4231                          * Now do the copy.  Mark the source page busy...
4232                          *
4233                          *      NOTE: This code holds the map lock across
4234                          *      the page copy.
4235                          */
4236                         vm_page_copy(cur_m, m);
4237                         vm_page_insert(m, object, offset);
4238                         m_object = object;
4239                         SET_PAGE_DIRTY(m, FALSE);
4240
4241                         /*
4242                          * Now cope with the source page and object
4243                          */
4244                         if (object->ref_count > 1 && cur_m->vmp_pmapped) {
4245                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
4246                         }
4247
4248                         if (cur_m->vmp_clustered) {
4249                                 VM_PAGE_COUNT_AS_PAGEIN(cur_m);
4250                                 VM_PAGE_CONSUME_CLUSTERED(cur_m);
4251                                 vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior);
4252                         }
4253                         need_collapse = TRUE;
4254
4255                         if (!cur_object->internal &&
4256                             cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
4257                                 /*
4258                                  * The object from which we've just
4259                                  * copied a page is most probably backed
4260                                  * by a vnode.  We don't want to waste too
4261                                  * much time trying to collapse the VM objects
4262                                  * and create a bottleneck when several tasks
4263                                  * map the same file.
4264                                  */
4265                                 if (cur_object->copy == object) {
4266                                         /*
4267                                          * Shared mapping or no COW yet.
4268                                          * We can never collapse a copy
4269                                          * object into its backing object.
4270                                          */
4271                                         need_collapse = FALSE;
4272                                 } else if (cur_object->copy == object->shadow &&
4273                                     object->shadow->resident_page_count == 0) {
4274                                         /*
4275                                          * Shared mapping after a COW occurred.
4276                                          */
4277                                         need_collapse = FALSE;
4278                                 }
4279                         }
4280                         vm_object_unlock(cur_object);
4281
4282                         if (need_collapse == FALSE) {
4283                                 vm_fault_collapse_skipped++;
4284                         }
4285                         vm_fault_collapse_total++;
4286
4287                         type_of_fault = DBG_COW_FAULT;
4288                         VM_STAT_INCR(cow_faults);
4289                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
4290                         current_task()->cow_faults++;
4291
4292                         goto FastPmapEnter;
4293                 } else {
4294                         /*
4295                          * No page at cur_object, cur_offset... m == NULL
4296                          */
4297                         if (cur_object->pager_created) {
4298                                 int     compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
4299
4300                                 if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
4301                                         int             my_fault_type;
4302                                         int             c_flags = C_DONT_BLOCK;
4303                                         boolean_t       insert_cur_object = FALSE;
4304
4305                                         /*
4306                                          * May have to talk to a pager...
4307                                          * if so, take the slow path by
4308                                          * doing a 'break' from the while (TRUE) loop
4309                                          *
4310                                          * external_state will only be set to VM_EXTERNAL_STATE_EXISTS
4311                                          * if the compressor is active and the page exists there
4312                                          */
4313                                         if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS) {
4314                                                 break;
4315                                         }
4316
4317                                         if (map == kernel_map || real_map == kernel_map) {
4318                                                 /*
4319                                                  * can't call into the compressor with the kernel_map
4320                                                  * lock held, since the compressor may try to operate
4321                                                  * on the kernel map in order to return an empty c_segment
4322                                                  */
4323                                                 break;
4324                                         }
4325                                         if (object != cur_object) {
4326                                                 if (fault_type & VM_PROT_WRITE) {
4327                                                         c_flags |= C_KEEP;
4328                                                 } else {
4329                                                         insert_cur_object = TRUE;
4330                                                 }
4331                                         }
4332                                         if (insert_cur_object == TRUE) {
4333                                                 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4334                                                         cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4335
4336                                                         if (vm_object_lock_upgrade(cur_object) == FALSE) {
4337                                                                 /*
4338                                                                  * couldn't upgrade so go do a full retry
4339                                                                  * immediately since we can no longer be
4340                                                                  * certain about cur_object (since we
4341                                                                  * don't hold a reference on it)...
4342                                                                  * first drop the top object lock
4343                                                                  */
4344                                                                 vm_object_unlock(object);
4345
4346                                                                 vm_map_unlock_read(map);
4347                                                                 if (real_map != map) {
4348                                                                         vm_map_unlock(real_map);
4349                                                                 }
4350
4351                                                                 goto RetryFault;
4352                                                         }
4353                                                 }
4354                                         } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4355                                                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4356
4357                                                 if (object != cur_object) {
4358                                                         /*
4359                                                          * we can't go for the upgrade on the top
4360                                                          * lock since the upgrade may block waiting
4361                                                          * for readers to drain... since we hold
4362                                                          * cur_object locked at this point, waiting
4363                                                          * for the readers to drain would represent
4364                                                          * a lock order inversion since the lock order
4365                                                          * for objects is the reference order in the
4366                                                          * shadow chain
4367                                                          */
4368                                                         vm_object_unlock(object);
4369                                                         vm_object_unlock(cur_object);
4370
4371                                                         vm_map_unlock_read(map);
4372                                                         if (real_map != map) {
4373                                                                 vm_map_unlock(real_map);
4374                                                         }
4375
4376                                                         goto RetryFault;
4377                                                 }
4378                                                 if (vm_object_lock_upgrade(object) == FALSE) {
4379                                                         /*
4380                                                          * couldn't upgrade, so explicitly take the lock
4381                                                          * exclusively and go relookup the page since we
4382                                                          * will have dropped the object lock and
4383                                                          * a different thread could have inserted
4384                                                          * a page at this offset
4385                                                          * no need for a full retry since we're
4386                                                          * at the top level of the object chain
4387                                                          */
4388                                                         vm_object_lock(object);
4389
4390                                                         continue;
4391                                                 }
4392                                         }
4393                                         m = vm_page_grab_options(grab_options);
4394                                         m_object = NULL;
4395
4396                                         if (m == VM_PAGE_NULL) {
4397                                                 /*
4398                                                  * no free page currently available...
4399                                                  * must take the slow path
4400                                                  */
4401                                                 break;
4402                                         }
4403
4404                                         /*
4405                                          * The object is and remains locked
4406                                          * so no need to take a
4407                                          * "paging_in_progress" reference.
4408                                          */
4409                                         boolean_t shared_lock;
4410                                         if ((object == cur_object &&
4411                                             object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
4412                                             (object != cur_object &&
4413                                             cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
4414                                                 shared_lock = FALSE;
4415                                         } else {
4416                                                 shared_lock = TRUE;
4417                                         }
4418
4419                                         kr = vm_compressor_pager_get(
4420                                                 cur_object->pager,
4421                                                 (cur_offset +
4422                                                 cur_object->paging_offset),
4423                                                 VM_PAGE_GET_PHYS_PAGE(m),
4424                                                 &my_fault_type,
4425                                                 c_flags,
4426                                                 &compressed_count_delta);
4427
4428                                         vm_compressor_pager_count(
4429                                                 cur_object->pager,
4430                                                 compressed_count_delta,
4431                                                 shared_lock,
4432                                                 cur_object);
4433
4434                                         if (kr != KERN_SUCCESS) {
4435                                                 vm_page_release(m, FALSE);
4436                                                 m = VM_PAGE_NULL;
4437                                                 break;
4438                                         }
4439                                         m->vmp_dirty = TRUE;
4440
4441                                         /*
4442                                          * If the object is purgeable, its
4443                                          * owner's purgeable ledgers will be
4444                                          * updated in vm_page_insert() but the
4445                                          * page was also accounted for in a
4446                                          * "compressed purgeable" ledger, so
4447                                          * update that now.
4448                                          */
4449                                         if (object != cur_object &&
4450                                             !insert_cur_object) {
4451                                                 /*
4452                                                  * We're not going to insert
4453                                                  * the decompressed page into
4454                                                  * the object it came from.
4455                                                  *
4456                                                  * We're dealing with a
4457                                                  * copy-on-write fault on
4458                                                  * "object".
4459                                                  * We're going to decompress
4460                                                  * the page directly into the
4461                                                  * target "object" while
4462                                                  * keeping the compressed
4463                                                  * page for "cur_object", so
4464                                                  * no ledger update in that
4465                                                  * case.
4466                                                  */
4467                                         } else if (((cur_object->purgable ==
4468                                             VM_PURGABLE_DENY) &&
4469                                             (!cur_object->vo_ledger_tag)) ||
4470                                             (cur_object->vo_owner ==
4471                                             NULL)) {
4472                                                 /*
4473                                                  * "cur_object" is not purgeable
4474                                                  * and is not ledger-tagged, or
4475                                                  * there's no owner for it,
4476                                                  * so no owner's ledgers to
4477                                                  * update.
4478                                                  */
4479                                         } else {
4480                                                 /*
4481                                                  * One less compressed
4482                                                  * purgeable/tagged page for
4483                                                  * cur_object's owner.
4484                                                  */
4485                                                 vm_object_owner_compressed_update(
4486                                                         cur_object,
4487                                                         -1);
4488                                         }
4489
4490                                         if (insert_cur_object) {
4491                                                 vm_page_insert(m, cur_object, cur_offset);
4492                                                 m_object = cur_object;
4493                                         } else {
4494                                                 vm_page_insert(m, object, offset);
4495                                                 m_object = object;
4496                                         }
4497
4498                                         if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
4499                                                 /*
4500                                                  * If the page is not cacheable,
4501                                                  * we can't let its contents
4502                                                  * linger in the data cache
4503                                                  * after the decompression.
4504                                                  */
4505                                                 pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m));
4506                                         }
4507
4508                                         type_of_fault = my_fault_type;
4509
4510                                         VM_STAT_DECOMPRESSIONS();
4511
4512                                         if (cur_object != object) {
4513                                                 if (insert_cur_object) {
4514                                                         top_object = object;
4515                                                         /*
4516                                                          * switch to the object that has the new page
4517                                                          */
4518                                                         object = cur_object;
4519                                                         object_lock_type = cur_object_lock_type;
4520                                                 } else {
4521                                                         vm_object_unlock(cur_object);
4522                                                         cur_object = object;
4523                                                 }
4524                                         }
4525                                         goto FastPmapEnter;
4526                                 }
4527                                 /*
4528                                  * existence map present and indicates
4529                                  * that the pager doesn't have this page
4530                                  */
4531                         }
4532                         if (cur_object->shadow == VM_OBJECT_NULL ||
4533                             resilient_media_retry) {
4534                                 /*
4535                                  * Zero fill fault.  Page gets
4536                                  * inserted into the original object.
4537                                  */
4538                                 if (cur_object->shadow_severed ||
4539                                     VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) ||
4540                                     cur_object == compressor_object ||
4541                                     cur_object == kernel_object ||
4542                                     cur_object == vm_submap_object) {
4543                                         if (object != cur_object) {
4544                                                 vm_object_unlock(cur_object);
4545                                         }
4546                                         vm_object_unlock(object);
4547
4548                                         vm_map_unlock_read(map);
4549                                         if (real_map != map) {
4550                                                 vm_map_unlock(real_map);
4551                                         }
4552
4553                                         kr = KERN_MEMORY_ERROR;
4554                                         goto done;
4555                                 }
4556                                 if (cur_object != object) {
4557                                         vm_object_unlock(cur_object);
4558
4559                                         cur_object = object;
4560                                 }
4561                                 if (object_lock_type == OBJECT_LOCK_SHARED) {
4562                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4563
4564                                         if (vm_object_lock_upgrade(object) == FALSE) {
4565                                                 /*
4566                                                  * couldn't upgrade so do a full retry on the fault
4567                                                  * since we dropped the object lock which
4568                                                  * could allow another thread to insert
4569                                                  * a page at this offset
4570                                                  */
4571                                                 vm_map_unlock_read(map);
4572                                                 if (real_map != map) {
4573                                                         vm_map_unlock(real_map);
4574                                                 }
4575
4576                                                 goto RetryFault;
4577                                         }
4578                                 }
4579                                 if (!object->internal) {
4580                                         panic("%s:%d should not zero-fill page at offset 0x%llx in external object %p", __FUNCTION__, __LINE__, (uint64_t)offset, object);
4581                                 }
4582                                 m = vm_page_alloc(object, offset);
4583                                 m_object = NULL;
4584
4585                                 if (m == VM_PAGE_NULL) {
4586                                         /*
4587                                          * no free page currently available...
4588                                          * must take the slow path
4589                                          */
4590                                         break;
4591                                 }
4592                                 m_object = object;
4593
4594                                 /*
4595                                  * Now zero fill page...
4596                                  * the page is probably going to
4597                                  * be written soon, so don't bother
4598                                  * to clear the modified bit
4599                                  *
4600                                  *   NOTE: This code holds the map
4601                                  *   lock across the zero fill.
4602                                  */
4603                                 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
4604
4605                                 goto FastPmapEnter;
4606                         }
4607                         /*
4608                          * On to the next level in the shadow chain
4609                          */
4610                         cur_offset += cur_object->vo_shadow_offset;
4611                         new_object = cur_object->shadow;
4612
4613                         /*
4614                          * take the new_object's lock with the indicated state
4615                          */
4616                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4617                                 vm_object_lock_shared(new_object);
4618                         } else {
4619                                 vm_object_lock(new_object);
4620                         }
4621
4622                         if (cur_object != object) {
4623                                 vm_object_unlock(cur_object);
4624                         }
4625
4626                         cur_object = new_object;
4627
4628                         continue;
4629                 }
4630         }
4631         /*
4632          * Cleanup from fast fault failure.  Drop any object
4633          * lock other than original and drop map lock.
4634          */
4635         if (object != cur_object) {
4636                 vm_object_unlock(cur_object);
4637         }
4638
4639         /*
4640          * must own the object lock exclusively at this point
4641          */
4642         if (object_lock_type == OBJECT_LOCK_SHARED) {
4643                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4644
4645                 if (vm_object_lock_upgrade(object) == FALSE) {
4646                         /*
4647                          * couldn't upgrade, so explicitly
4648                          * take the lock exclusively
4649                          * no need to retry the fault at this
4650                          * point since "vm_fault_page" will
4651                          * completely re-evaluate the state
4652                          */
4653                         vm_object_lock(object);
4654                 }
4655         }
4656
4657 handle_copy_delay:
4658         vm_map_unlock_read(map);
4659         if (real_map != map) {
4660                 vm_map_unlock(real_map);
4661         }
4662
4663         if (__improbable(object == compressor_object ||
4664             object == kernel_object ||
4665             object == vm_submap_object)) {
4666                 /*
4667                  * These objects are explicitly managed and populated by the
4668                  * kernel.  The virtual ranges backed by these objects should
4669                  * either have wired pages or "holes" that are not supposed to
4670                  * be accessed at all until they get explicitly populated.
4671                  * We should never have to resolve a fault on a mapping backed
4672                  * by one of these VM objects and providing a zero-filled page
4673                  * would be wrong here, so let's fail the fault and let the
4674                  * caller crash or recover.
4675                  */
4676                 vm_object_unlock(object);
4677                 kr = KERN_MEMORY_ERROR;
4678                 goto done;
4679         }
4680
4681         assert(object != compressor_object);
4682         assert(object != kernel_object);
4683         assert(object != vm_submap_object);
4684
4685         if (resilient_media_retry) {
4686                 /*
4687                  * We could get here if we failed to get a free page
4688                  * to zero-fill and had to take the slow path again.
4689                  * Reset our "recovery-from-failed-media" state.
4690                  */
4691                 assert(resilient_media_object != VM_OBJECT_NULL);
4692                 assert(resilient_media_offset != (vm_object_offset_t)-1);
4693                 /* release our extra reference on failed object */
4694 //             printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
4695                 vm_object_deallocate(resilient_media_object);
4696                 resilient_media_object = VM_OBJECT_NULL;
4697                 resilient_media_offset = (vm_object_offset_t)-1;
4698                 resilient_media_retry = FALSE;
4699         }
4700
4701         /*
4702          * Make a reference to this object to
4703          * prevent its disposal while we are messing with
4704          * it.  Once we have the reference, the map is free
4705          * to be diddled.  Since objects reference their
4706          * shadows (and copies), they will stay around as well.
4707          */
4708         vm_object_reference_locked(object);
4709         vm_object_paging_begin(object);
4710
4711         set_thread_pagein_error(cthread, 0);
4712         error_code = 0;
4713
4714         result_page = VM_PAGE_NULL;
4715         kr = vm_fault_page(object, offset, fault_type,
4716             (change_wiring && !wired),
4717             FALSE,                /* page not looked up */
4718             &prot, &result_page, &top_page,
4719             &type_of_fault,
4720             &error_code, map->no_zero_fill,
4721             FALSE, &fault_info);
4722
4723         /*
4724          * if kr != VM_FAULT_SUCCESS, then the paging reference
4725          * has been dropped and the object unlocked... the ref_count
4726          * is still held
4727          *
4728          * if kr == VM_FAULT_SUCCESS, then the paging reference
4729          * is still held along with the ref_count on the original object
4730          *
4731          *      the object is returned locked with a paging reference
4732          *
4733          *      if top_page != NULL, then it's BUSY and the
4734          *      object it belongs to has a paging reference
4735          *      but is returned unlocked
4736          */
4737         if (kr != VM_FAULT_SUCCESS &&
4738             kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
4739                 if (kr == VM_FAULT_MEMORY_ERROR &&
4740                     fault_info.resilient_media) {
4741                         assertf(object->internal, "object %p", object);
4742                         /*
4743                          * This fault failed but the mapping was
4744                          * "media resilient", so we'll retry the fault in
4745                          * recovery mode to get a zero-filled page in the
4746                          * top object.
4747                          * Keep the reference on the failing object so
4748                          * that we can check that the mapping is still
4749                          * pointing to it when we retry the fault.
4750                          */
4751 //                     printf("RESILIENT_MEDIA %s:%d: object %p offset 0x%llx recover from media error 0x%x kr 0x%x top_page %p result_page %p\n", __FUNCTION__, __LINE__, object, offset, error_code, kr, top_page, result_page);
4752                         assert(!resilient_media_retry); /* no double retry */
4753                         assert(resilient_media_object == VM_OBJECT_NULL);
4754                         assert(resilient_media_offset == (vm_object_offset_t)-1);
4755                         resilient_media_retry = TRUE;
4756                         resilient_media_object = object;
4757                         resilient_media_offset = offset;
4758 //                     printf("FBDP %s:%d resilient_media_object %p offset 0x%llx kept reference\n", __FUNCTION__, __LINE__, resilient_media_object, resilient_media_offset);
4759                         goto RetryFault;
4760                 } else {
4761                         /*
4762                          * we didn't succeed, lose the object reference
4763                          * immediately.
4764                          */
4765                         vm_object_deallocate(object);
4766                         object = VM_OBJECT_NULL; /* no longer valid */
4767                 }
4768
4769                 /*
4770                  * See why we failed, and take corrective action.
4771                  */
4772                 switch (kr) {
4773                 case VM_FAULT_MEMORY_SHORTAGE:
4774                         if (vm_page_wait((change_wiring) ?
4775                             THREAD_UNINT :
4776                             THREAD_ABORTSAFE)) {
4777                                 goto RetryFault;
4778                         }
4779                 /*
4780                  * fall thru
4781                  */
4782                 case VM_FAULT_INTERRUPTED:
4783                         kr = KERN_ABORTED;
4784                         goto done;
4785                 case VM_FAULT_RETRY:
4786                         goto RetryFault;
4787                 case VM_FAULT_MEMORY_ERROR:
4788                         if (error_code) {
4789                                 kr = error_code;
4790                         } else {
4791                                 kr = KERN_MEMORY_ERROR;
4792                         }
4793                         goto done;
4794                 default:
4795                         panic("vm_fault: unexpected error 0x%x from "
4796                             "vm_fault_page()\n", kr);
4797                 }
4798         }
4799         m = result_page;
4800         m_object = NULL;
4801
4802         if (m != VM_PAGE_NULL) {
4803                 m_object = VM_PAGE_OBJECT(m);
4804                 assert((change_wiring && !wired) ?
4805                     (top_page == VM_PAGE_NULL) :
4806                     ((top_page == VM_PAGE_NULL) == (m_object == object)));
4807         }
4808
4809         /*
4810          * What to do with the resulting page from vm_fault_page if it doesn't get
4811          * entered into the physical map: PAGE_WAKEUP_DONE() it and, unless it is VM_PAGE_PAGEABLE() (re-tested with the page queues locked), vm_page_activate() it.
4812          */
4813 #define RELEASE_PAGE(m)                                 \
4814         MACRO_BEGIN                                     \
4815         PAGE_WAKEUP_DONE(m);                            \
4816         if ( !VM_PAGE_PAGEABLE(m)) {                    \
4817                 vm_page_lockspin_queues();              \
4818                 if ( !VM_PAGE_PAGEABLE(m))              \
4819                         vm_page_activate(m);            \
4820                 vm_page_unlock_queues();                \
4821         }                                               \
4822         MACRO_END
4823
4824
4825         object_locks_dropped = FALSE;
4826         /*
4827          * We must verify that the maps have not changed
4828          * since our last lookup. vm_map_verify() needs the
4829          * map lock (shared) but we are holding object locks.
4830          * So we do a try_lock() first and, if that fails, we
4831          * drop the object locks and go in for the map lock again.
4832          */
4833         if (!vm_map_try_lock_read(original_map)) {
4834                 if (m != VM_PAGE_NULL) {
4835                         old_copy_object = m_object->copy;
4836                         vm_object_unlock(m_object);
4837                 } else {
4838                         old_copy_object = VM_OBJECT_NULL;
4839                         vm_object_unlock(object);
4840                 }
4841
4842                 object_locks_dropped = TRUE;
4843
4844                 vm_map_lock_read(original_map);
4845         }
4846
4847         if ((map != original_map) || !vm_map_verify(map, &version)) {
4848                 if (object_locks_dropped == FALSE) {
4849                         if (m != VM_PAGE_NULL) {
4850                                 old_copy_object = m_object->copy;
4851                                 vm_object_unlock(m_object);
4852                         } else {
4853                                 old_copy_object = VM_OBJECT_NULL;
4854                                 vm_object_unlock(object);
4855                         }
4856
4857                         object_locks_dropped = TRUE;
4858                 }
4859
4860                 /*
4861                  * no object locks are held at this point
4862                  */
4863                 vm_object_t             retry_object;
4864                 vm_object_offset_t      retry_offset;
4865                 vm_prot_t               retry_prot;
4866
4867                 /*
4868                  * To avoid trying to write_lock the map while another
4869                  * thread has it read_locked (in vm_map_pageable), we
4870                  * do not try for write permission.  If the page is
4871                  * still writable, we will get write permission.  If it
4872                  * is not, or has been marked needs_copy, we enter the
4873                  * mapping without write permission, and will merely
4874                  * take another fault.
4875                  */
4876                 map = original_map;
4877
4878                 kr = vm_map_lookup_locked(&map, vaddr,
4879                     fault_type & ~VM_PROT_WRITE,
4880                     OBJECT_LOCK_EXCLUSIVE, &version,
4881                     &retry_object, &retry_offset, &retry_prot,
4882                     &wired,
4883                     &fault_info,
4884                     &real_map);
4885                 pmap = real_map->pmap;
4886
4887                 if (kr != KERN_SUCCESS) {
4888                         vm_map_unlock_read(map);
4889
4890                         if (m != VM_PAGE_NULL) {
4891                                 assert(VM_PAGE_OBJECT(m) == m_object);
4892
4893                                 /*
4894                                  * retake the lock so that
4895                                  * we can drop the paging reference
4896                                  * in vm_fault_cleanup and do the
4897                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
4898                                  */
4899                                 vm_object_lock(m_object);
4900
4901                                 RELEASE_PAGE(m);
4902
4903                                 vm_fault_cleanup(m_object, top_page);
4904                         } else {
4905                                 /*
4906                                  * retake the lock so that
4907                                  * we can drop the paging reference
4908                                  * in vm_fault_cleanup
4909                                  */
4910                                 vm_object_lock(object);
4911
4912                                 vm_fault_cleanup(object, top_page);
4913                         }
4914                         vm_object_deallocate(object);
4915
4916                         goto done;
4917                 }
4918                 vm_object_unlock(retry_object);
4919
4920                 if ((retry_object != object) || (retry_offset != offset)) {
4921                         vm_map_unlock_read(map);
4922                         if (real_map != map) {
4923                                 vm_map_unlock(real_map);
4924                         }
4925
4926                         if (m != VM_PAGE_NULL) {
4927                                 assert(VM_PAGE_OBJECT(m) == m_object);
4928
4929                                 /*
4930                                  * retake the lock so that
4931                                  * we can drop the paging reference
4932                                  * in vm_fault_cleanup and do the
4933                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
4934                                  */
4935                                 vm_object_lock(m_object);
4936
4937                                 RELEASE_PAGE(m);
4938
4939                                 vm_fault_cleanup(m_object, top_page);
4940                         } else {
4941                                 /*
4942                                  * retake the lock so that
4943                                  * we can drop the paging reference
4944                                  * in vm_fault_cleanup
4945                                  */
4946                                 vm_object_lock(object);
4947
4948                                 vm_fault_cleanup(object, top_page);
4949                         }
4950                         vm_object_deallocate(object);
4951
4952                         goto RetryFault;
4953                 }
4954                 /*
4955                  * Check whether the protection has changed or the object
4956                  * has been copied while we left the map unlocked.
4957                  */
4958                 if (pmap_has_prot_policy(retry_prot)) {
4959                         /* If the pmap layer cares, pass the full set. */
4960                         prot = retry_prot;
4961                 } else {
4962                         prot &= retry_prot;
4963                 }
4964         }
4965
4966         if (object_locks_dropped == TRUE) {
4967                 if (m != VM_PAGE_NULL) {
4968                         vm_object_lock(m_object);
4969
4970                         if (m_object->copy != old_copy_object) {
4971                                 /*
4972                                  * The copy object changed while the top-level object
4973                                  * was unlocked, so take away write permission.
4974                                  */
4975                                 assert(!pmap_has_prot_policy(prot));
4976                                 prot &= ~VM_PROT_WRITE;
4977                         }
4978                 } else {
4979                         vm_object_lock(object);
4980                 }
4981
4982                 object_locks_dropped = FALSE;
4983         }
4984
4985         if (!need_copy &&
4986             !fault_info.no_copy_on_read &&
4987             m != VM_PAGE_NULL &&
4988             VM_PAGE_OBJECT(m) != object &&
4989             !VM_PAGE_OBJECT(m)->pager_trusted &&
4990             vm_protect_privileged_from_untrusted &&
4991             !((prot & VM_PROT_EXECUTE) &&
4992             VM_PAGE_OBJECT(m)->code_signed &&
4993             cs_process_enforcement(NULL)) &&
4994             current_proc_is_privileged()) {
4995                 /*
4996                  * We found the page we want in an "untrusted" VM object
4997                  * down the shadow chain.  Since the target is "privileged"
4998                  * we want to perform a copy-on-read of that page, so that the
4999                  * mapped object gets a stable copy and does not have to
5000                  * rely on the "untrusted" object to provide the same
5001                  * contents if the page gets reclaimed and has to be paged
5002                  * in again later on.
5003                  *
5004                  * Special case: if the mapping is executable and the untrusted
5005                  * object is code-signed and the process is "cs_enforced", we
5006                  * do not copy-on-read because that would break code-signing
5007                  * enforcement expectations (an executable page must belong
5008                  * to a code-signed object) and we can rely on code-signing
5009                  * to re-validate the page if it gets evicted and paged back in.
5010                  */
5011 //              printf("COPY-ON-READ %s:%d map %p vaddr 0x%llx obj %p offset 0x%llx found page %p (obj %p offset 0x%llx) UNTRUSTED -> need copy-on-read\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, object, offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
5012                 vm_copied_on_read++;
5013                 need_copy_on_read = TRUE;
5014                 need_copy = TRUE;
5015         } else {
5016                 need_copy_on_read = FALSE;
5017         }
5018
5019         /*
5020          * If we want to wire down this page, but no longer have
5021          * adequate permissions, we must start all over.
5022          * If we decided to copy-on-read, we must also start all over.
5023          */
5024         if ((wired && (fault_type != (prot | VM_PROT_WRITE))) ||
5025             need_copy_on_read) {
5026                 vm_map_unlock_read(map);
5027                 if (real_map != map) {
5028                         vm_map_unlock(real_map);
5029                 }
5030
5031                 if (m != VM_PAGE_NULL) {
5032                         assert(VM_PAGE_OBJECT(m) == m_object);
5033
5034                         RELEASE_PAGE(m);
5035
5036                         vm_fault_cleanup(m_object, top_page);
5037                 } else {
5038                         vm_fault_cleanup(object, top_page);
5039                 }
5040
5041                 vm_object_deallocate(object);
5042
5043                 goto RetryFault;
5044         }
5045         if (m != VM_PAGE_NULL) {
5046                 /*
5047                  * Put this page into the physical map.
5048                  * We had to do the unlock above because pmap_enter
5049                  * may cause other faults.  The page may be on
5050                  * the pageout queues.  If the pageout daemon comes
5051                  * across the page, it will remove it from the queues.
5052                  */
5053                 if (caller_pmap) {
5054                         kr = vm_fault_enter(m,
5055                             caller_pmap,
5056                             caller_pmap_addr,
5057                             prot,
5058                             caller_prot,
5059                             wired,
5060                             change_wiring,
5061                             wire_tag,
5062                             &fault_info,
5063                             NULL,
5064                             &type_of_fault);
5065                 } else {
5066                         kr = vm_fault_enter(m,
5067                             pmap,
5068                             vaddr,
5069                             prot,
5070                             caller_prot,
5071                             wired,
5072                             change_wiring,
5073                             wire_tag,
5074                             &fault_info,
5075                             NULL,
5076                             &type_of_fault);
5077                 }
5078                 assert(VM_PAGE_OBJECT(m) == m_object);
5079
5080                 {
5081                         int     event_code = 0;
5082
5083                         if (m_object->internal) {
5084                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
5085                         } else if (m_object->object_is_shared_cache) {
5086                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
5087                         } else {
5088                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
5089                         }
5090
5091                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0);
5092                         KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_SLOW), get_current_unique_pid(), 0, 0, 0, 0);
5093
5094                         DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
5095                 }
5096                 if (kr != KERN_SUCCESS) {
5097                         /* abort this page fault */
5098                         vm_map_unlock_read(map);
5099                         if (real_map != map) {
5100                                 vm_map_unlock(real_map);
5101                         }
5102                         PAGE_WAKEUP_DONE(m);
5103                         vm_fault_cleanup(m_object, top_page);
5104                         vm_object_deallocate(object);
5105                         goto done;
5106                 }
5107                 if (physpage_p != NULL) {
5108                         /* for vm_map_wire_and_extract() */
5109                         *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
5110                         if (prot & VM_PROT_WRITE) {
5111                                 vm_object_lock_assert_exclusive(m_object);
5112                                 m->vmp_dirty = TRUE;
5113                         }
5114                 }
5115         } else {
5116                 vm_map_entry_t          entry;
5117                 vm_map_offset_t         laddr;
5118                 vm_map_offset_t         ldelta, hdelta;
5119
5120                 /*
5121                  * do a pmap block mapping from the physical address
5122                  * in the object
5123                  */
5124
5125                 if (real_map != map) {
5126                         vm_map_unlock(real_map);
5127                 }
5128
5129                 if (original_map != map) {
5130                         vm_map_unlock_read(map);
5131                         vm_map_lock_read(original_map);
5132                         map = original_map;
5133                 }
5134                 real_map = map;
5135
5136                 laddr = vaddr;
5137                 hdelta = 0xFFFFF000;
5138                 ldelta = 0xFFFFF000;
5139
5140                 while (vm_map_lookup_entry(map, laddr, &entry)) {
5141                         if (ldelta > (laddr - entry->vme_start)) {
5142                                 ldelta = laddr - entry->vme_start;
5143                         }
5144                         if (hdelta > (entry->vme_end - laddr)) {
5145                                 hdelta = entry->vme_end - laddr;
5146                         }
5147                         if (entry->is_sub_map) {
5148                                 laddr = ((laddr - entry->vme_start)
5149                                     + VME_OFFSET(entry));
5150                                 vm_map_lock_read(VME_SUBMAP(entry));
5151
5152                                 if (map != real_map) {
5153                                         vm_map_unlock_read(map);
5154                                 }
5155                                 if (entry->use_pmap) {
5156                                         vm_map_unlock_read(real_map);
5157                                         real_map = VME_SUBMAP(entry);
5158                                 }
5159                                 map = VME_SUBMAP(entry);
5160                         } else {
5161                                 break;
5162                         }
5163                 }
5164
5165                 if (vm_map_lookup_entry(map, laddr, &entry) &&
5166                     (VME_OBJECT(entry) != NULL) &&
5167                     (VME_OBJECT(entry) == object)) {
5168                         int superpage;
5169
5170                         if (!object->pager_created &&
5171                             object->phys_contiguous &&
5172                             VME_OFFSET(entry) == 0 &&
5173                             (entry->vme_end - entry->vme_start == object->vo_size) &&
5174                             VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size - 1))) {
5175                                 superpage = VM_MEM_SUPERPAGE;
5176                         } else {
5177                                 superpage = 0;
5178                         }
5179
5180                         if (superpage && physpage_p) {
5181                                 /* for vm_map_wire_and_extract() */
5182                                 *physpage_p = (ppnum_t)
5183                                     ((((vm_map_offset_t)
5184                                     object->vo_shadow_offset)
5185                                     + VME_OFFSET(entry)
5186                                     + (laddr - entry->vme_start))
5187                                     >> PAGE_SHIFT);
5188                         }
5189
5190                         if (caller_pmap) {
5191                                 /*
5192                                  * Set up a block mapped area
5193                                  */
5194                                 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
5195                                 kr = pmap_map_block(caller_pmap,
5196                                     (addr64_t)(caller_pmap_addr - ldelta),
5197                                     (ppnum_t)((((vm_map_offset_t) (VME_OBJECT(entry)->vo_shadow_offset)) +
5198                                     VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
5199                                     (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
5200                                     (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
5201
5202                                 if (kr != KERN_SUCCESS) {
5203                                         goto cleanup;
5204                                 }
5205                         } else {
5206                                 /*
5207                                  * Set up a block mapped area
5208                                  */
5209                                 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
5210                                 kr = pmap_map_block(real_map->pmap,
5211                                     (addr64_t)(vaddr - ldelta),
5212                                     (ppnum_t)((((vm_map_offset_t)(VME_OBJECT(entry)->vo_shadow_offset)) +
5213                                     VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
5214                                     (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
5215                                     (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
5216
5217                                 if (kr != KERN_SUCCESS) {
5218                                         goto cleanup;
5219                                 }
5220                         }
5221                 }
5222         }
5223
5224         /*
5225          * Success
5226          */
5227         kr = KERN_SUCCESS;
5228
5229         /*
5230          * TODO: could most of the done cases just use cleanup?
5231          */
5232 cleanup:
5233         /*
5234          * Unlock everything, and return
5235          */
5236         vm_map_unlock_read(map);
5237         if (real_map != map) {
5238                 vm_map_unlock(real_map);
5239         }
5240
5241         if (m != VM_PAGE_NULL) {
5242                 assert(VM_PAGE_OBJECT(m) == m_object);
5243
5244                 if (!m_object->internal && (fault_type & VM_PROT_WRITE)) {
5245                         vm_object_paging_begin(m_object);
5246
5247                         assert(written_on_object == VM_OBJECT_NULL);
5248                         written_on_object = m_object;
5249                         written_on_pager = m_object->pager;
5250                         written_on_offset = m_object->paging_offset + m->vmp_offset;
5251                 }
5252                 PAGE_WAKEUP_DONE(m);
5253
5254                 vm_fault_cleanup(m_object, top_page);
5255         } else {
5256                 vm_fault_cleanup(object, top_page);
5257         }
5258
5259         vm_object_deallocate(object);
5260
5261 #undef  RELEASE_PAGE
5262
5263 done:
5264         thread_interrupt_level(interruptible_state);
5265
5266         if (resilient_media_object != VM_OBJECT_NULL) {
5267                 assert(resilient_media_retry);
5268                 assert(resilient_media_offset != (vm_object_offset_t)-1);
5269                 /* release extra reference on failed object */
5270 //             printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
5271                 vm_object_deallocate(resilient_media_object);
5272                 resilient_media_object = VM_OBJECT_NULL;
5273                 resilient_media_offset = (vm_object_offset_t)-1;
5274                 resilient_media_retry = FALSE;
5275         }
5276         assert(!resilient_media_retry);
5277
5278         /*
5279          * Only I/O throttle on faults which cause a pagein/swapin.
5280          */
5281         if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
5282                 throttle_lowpri_io(1);
5283         } else {
5284                 if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
5285                         if ((throttle_delay = vm_page_throttled(TRUE))) {
5286                                 if (vm_debug_events) {
5287                                         if (type_of_fault == DBG_COMPRESSOR_FAULT) {
5288                                                 VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5289                                         } else if (type_of_fault == DBG_COW_FAULT) {
5290                                                 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5291                                         } else {
5292                                                 VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5293                                         }
5294                                 }
5295                                 delay(throttle_delay);
5296                         }
5297                 }
5298         }
5299
5300         if (written_on_object) {
5301                 vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64);
5302
5303                 vm_object_lock(written_on_object);
5304                 vm_object_paging_end(written_on_object);
5305                 vm_object_unlock(written_on_object);
5306
5307                 written_on_object = VM_OBJECT_NULL;
5308         }
5309
5310         if (rtfault) {
5311                 vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault);
5312         }
5313
5314         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
5315             (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
5316             ((uint64_t)trace_vaddr >> 32),
5317             trace_vaddr,
5318             kr,
5319             type_of_fault,
5320             0);
5321
5322         return kr;
5323 }
5324
5325 /*
5326  *      vm_fault_wire:
5327  *
5328  *      Wire down the virtual address range covered by "entry" in
5329  *      "map".  Each page is faulted in as a wiring fault and entered
5330  *      in "pmap" at the matching offset from "pmap_addr".  An entry
5331  *      backed by a phys_contiguous object needs no work.
5332  *
5333  *      Returns KERN_SUCCESS when the whole range is wired.  If a page
5334  *      cannot be wired, the pages wired so far are unwired again and
5335  *      the status of the failing fault is returned.
5336  */
5337 kern_return_t
5338 vm_fault_wire(
5339         vm_map_t        map,
5340         vm_map_entry_t  entry,
5341         vm_prot_t       prot,
5342         vm_tag_t        wire_tag,
5343         pmap_t          pmap,
5344         vm_map_offset_t pmap_addr,
5345         ppnum_t         *physpage_p)
5346 {
5347         vm_map_offset_t const   wire_end = entry->vme_end;
5348         vm_map_offset_t         cur;
5349         kern_return_t           kr;
5350
5351         assert(entry->in_transition);
5352
5353         /* such memory is wired by default: nothing to do */
5354         if (VME_OBJECT(entry) != NULL && !entry->is_sub_map &&
5355             VME_OBJECT(entry)->phys_contiguous) {
5356                 return KERN_SUCCESS;
5357         }
5358
5359         /*
5360          *      The range may no longer fault once it is wired: let the
5361          *      physical map lock down its page tables and such too.
5362          */
5363         pmap_pageable(pmap, pmap_addr,
5364             pmap_addr + (wire_end - entry->vme_start), FALSE);
5365
5366         /* Simulate a fault on every page to get it into the pmap. */
5367         cur = entry->vme_start;
5368         while (cur < wire_end) {
5369                 /* common case first */
5370                 kr = vm_fault_wire_fast(map, cur, prot, wire_tag, entry, pmap,
5371                     pmap_addr + (cur - entry->vme_start), physpage_p);
5372                 if (kr != KERN_SUCCESS) {
5373                         /* fall back on the general fault handler */
5374                         kr = vm_fault_internal(map, cur, prot, TRUE, wire_tag,
5375                             (pmap == kernel_pmap) ? THREAD_UNINT : THREAD_ABORTSAFE,
5376                             pmap, pmap_addr + (cur - entry->vme_start), physpage_p);
5377                         DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
5378
5379                         if (kr != KERN_SUCCESS) {
5380                                 /* back out the pages wired before "cur" */
5381                                 struct vm_map_entry     partial = *entry;
5382
5383                                 partial.vme_end = cur;
5384                                 vm_fault_unwire(map, &partial, FALSE,
5385                                     pmap, pmap_addr);
5386                                 return kr;
5387                         }
5388                 }
5389
5390                 cur += PAGE_SIZE;
5391         }
5392
5393         return KERN_SUCCESS;
5394 }
5395
5396 /*
5397  *      vm_fault_unwire:
5398  *      Unwire the range of virtual addresses covered by "entry" in "map",
5399  *      or free its pages when "deallocate" is set.  Panics on an unexpected vm_fault_page() failure.
5400  */
5401 void
5402 vm_fault_unwire(
5403         vm_map_t        map,            /* map the range belongs to */
5404         vm_map_entry_t  entry,          /* entry giving the range, object and offset */
5405         boolean_t       deallocate,     /* TRUE: disconnect and free each page instead of unwiring it */
5406         pmap_t          pmap,           /* pmap whose wiring is changed; checked for NULL before pmap_change_wiring() */
5407         vm_map_offset_t pmap_addr)      /* address in "pmap" that corresponds to entry->vme_start */
5408 {
5409         vm_map_offset_t va;
5410         vm_map_offset_t end_addr = entry->vme_end;
5411         vm_object_t             object;
5412         struct vm_object_fault_info fault_info = {};
5413         unsigned int    unwired_pages;  /* wired pages unwired or freed below */
5414         /* a submap entry is treated as having no object; such entries go through vm_fault() below */
5415         object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);
5416
5417         /*
5418          * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
5419          * do anything since such memory is wired by default.  So we don't have
5420          * anything to undo here.
5421          */
5422
5423         if (object != VM_OBJECT_NULL && object->phys_contiguous) {
5424                 return;
5425         }
5426         /* fault parameters passed to vm_fault_page(); only cluster_size changes per page */
5427         fault_info.interruptible = THREAD_UNINT;
5428         fault_info.behavior = entry->behavior;
5429         fault_info.user_tag = VME_ALIAS(entry);
5430         if (entry->iokit_acct ||
5431             (!entry->is_sub_map && !entry->use_pmap)) {
5432                 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
5433         }
5434         fault_info.lo_offset = VME_OFFSET(entry);
5435         fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
5436         fault_info.no_cache = entry->no_cache;
5437         fault_info.stealth = TRUE;
5438
5439         unwired_pages = 0;
5440
5441         /*
5442          *      Since the pages are wired down, we must be able to
5443          *      get their mappings from the physical map system.
5444          */
5445
5446         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
5447                 if (object == VM_OBJECT_NULL) { /* submap entry, or entry without an object */
5448                         if (pmap) {
5449                                 pmap_change_wiring(pmap,
5450                                     pmap_addr + (va - entry->vme_start), FALSE);
5451                         }
5452                         (void) vm_fault(map, va, VM_PROT_NONE,
5453                             TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr); /* NOTE(review): pmap_addr is not offset by (va - entry->vme_start) here, unlike above -- confirm */
5454                 } else {
5455                         vm_prot_t       prot;
5456                         vm_page_t       result_page;
5457                         vm_page_t       top_page;
5458                         vm_object_t     result_object;
5459                         vm_fault_return_t result;
5460                         /* cluster size is the rest of the range (end_addr - va), */
5461                         /* capped when that does not fit in a upl_size_t */
5462                         upl_size_t cluster_size;
5463                         if (os_sub_overflow(end_addr, va, &cluster_size)) {
5464                                 cluster_size = 0 - (upl_size_t)PAGE_SIZE; /* presumably the largest page-aligned upl_size_t; assumes the type is unsigned -- confirm */
5465                         }
5466                         fault_info.cluster_size = cluster_size;
5467                         /* the object lock and paging reference are taken anew on each attempt */
5468                         do {
5469                                 prot = VM_PROT_NONE;
5470
5471                                 vm_object_lock(object);
5472                                 vm_object_paging_begin(object);
5473                                 result_page = VM_PAGE_NULL;
5474                                 result = vm_fault_page(
5475                                         object,
5476                                         (VME_OFFSET(entry) +
5477                                         (va - entry->vme_start)),
5478                                         VM_PROT_NONE, TRUE,
5479                                         FALSE, /* page not looked up */
5480                                         &prot, &result_page, &top_page,
5481                                         (int *)0,
5482                                         NULL, map->no_zero_fill,
5483                                         FALSE, &fault_info);
5484                         } while (result == VM_FAULT_RETRY);
5485
5486                         /*
5487                          * If this was a mapping to a file on a device that has been forcibly
5488                          * unmounted, then we won't get a page back from vm_fault_page().  Just
5489                          * move on to the next one in case the remaining pages are mapped from
5490                          * different objects.  During a forced unmount, the object is terminated
5491                          * so the alive flag will be false if this happens.  A forced unmount will
5492                          * occur when an external disk is unplugged before the user does an
5493                          * eject, so we don't want to panic in that situation.
5494                          */
5495
5496                         if (result == VM_FAULT_MEMORY_ERROR && !object->alive) {
5497                                 continue;
5498                         }
5499
5500                         if (result == VM_FAULT_MEMORY_ERROR &&
5501                             object == kernel_object) {
5502                                 /*
5503                                  * This must have been allocated with
5504                                  * KMA_KOBJECT and KMA_VAONLY and there's
5505                                  * no physical page at this offset.
5506                                  * We're done (no page to free).
5507                                  */
5508                                 assert(deallocate);
5509                                 continue;
5510                         }
5511
5512                         if (result != VM_FAULT_SUCCESS) {
5513                                 panic("vm_fault_unwire: failure");
5514                         }
5515                         /* clean up against the object the page was actually found in */
5516                         result_object = VM_PAGE_OBJECT(result_page);
5517
5518                         if (deallocate) {
5519                                 assert(VM_PAGE_GET_PHYS_PAGE(result_page) !=
5520                                     vm_page_fictitious_addr);
5521                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page));
5522                                 if (VM_PAGE_WIRED(result_page)) {
5523                                         unwired_pages++;
5524                                 }
5525                                 VM_PAGE_FREE(result_page);
5526                         } else {
5527                                 if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr)) {
5528                                         pmap_change_wiring(pmap,
5529                                             pmap_addr + (va - entry->vme_start), FALSE);
5530                                 }
5531
5532                                 /* only pages still wired are unwired (under the page queues lock) and counted */
5533                                 if (VM_PAGE_WIRED(result_page)) {
5534                                         vm_page_lockspin_queues();
5535                                         vm_page_unwire(result_page, TRUE);
5536                                         vm_page_unlock_queues();
5537                                         unwired_pages++;
5538                                 }
5539                                 if (entry->zero_wired_pages) {
5540                                         pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page));
5541                                         entry->zero_wired_pages = FALSE; /* NOTE(review): cleared after the first page, so later pages of this range are not zeroed here -- confirm */
5542                                 }
5543
5544                                 PAGE_WAKEUP_DONE(result_page);
5545                         }
5546                         vm_fault_cleanup(result_object, top_page);
5547                 }
5548         }
5549
5550         /*
5551          *      Inform the physical mapping system that the range
5552          *      of addresses may fault, so that page tables and
5553          *      such may be unwired themselves.
5554          */
5555
5556         pmap_pageable(pmap, pmap_addr,
5557             pmap_addr + (end_addr - entry->vme_start), TRUE);
5558         /* for kernel_object, reduce the tag's accounted size by the pages unwired above */
5559         if (kernel_object == object) {
5560                 vm_tag_update_size(fault_info.user_tag, -ptoa_64(unwired_pages));
5561         }
5562 }
5563
/*
 *      vm_fault_wire_fast:
 *
 *      Handle common case of a wire down page fault at the given address.
 *      If successful, the page is inserted into the associated physical map.
 *      The map entry is passed in to avoid the overhead of a map lookup.
 *
 *      NOTE: the given address should be truncated to the
 *      proper page address.
 *
 *      KERN_SUCCESS is returned if the page fault is handled; otherwise,
 *      a standard error specifying why the fault is fatal is returned.
 *      (As written, every bail-out in this routine -- the submap check and
 *      each GIVE_UP, including the one taken when vm_fault_enter() fails --
 *      returns KERN_FAILURE.)
 *
 *      The map in question must be referenced, and remains so.
 *      Caller has a read lock on the map.
 *
 *      This is a stripped version of vm_fault() for wiring pages.  Anything
 *      other than the common case will return KERN_FAILURE, and the caller
 *      is expected to call vm_fault().
 *
 *      Parameters:
 *              map             unused.
 *              va              faulting virtual address; combined with
 *                              entry->vme_start and VME_OFFSET(entry) to
 *                              compute the offset into the entry's object.
 *              caller_prot     unused; the entry's own protection is used.
 *              wire_tag        tag handed to vm_page_wire() and
 *                              vm_fault_enter().
 *              entry           map entry covering "va"; supplies the object,
 *                              offset, protection, alias and accounting flags.
 *              pmap, pmap_addr physical map and address handed to
 *                              vm_fault_enter().
 *              physpage_p      optional.  When non-NULL and the routine
 *                              reaches "done" with KERN_SUCCESS, receives the
 *                              page's physical page number.  It is not
 *                              written on the KERN_FAILURE bail-out paths.
 */
static kern_return_t
vm_fault_wire_fast(
        __unused vm_map_t       map,
        vm_map_offset_t va,
        __unused vm_prot_t       caller_prot,
        vm_tag_t        wire_tag,
        vm_map_entry_t  entry,
        pmap_t          pmap,
        vm_map_offset_t pmap_addr,
        ppnum_t         *physpage_p)
{
        vm_object_t             object;
        vm_object_offset_t      offset;
        vm_page_t               m;
        vm_prot_t               prot;
        thread_t                thread = current_thread();
        int                     type_of_fault;
        kern_return_t           kr;
        struct vm_object_fault_info fault_info = {};

        /* Account for this fault globally and, when available, per task. */
        VM_STAT_INCR(faults);

        if (thread != THREAD_NULL && thread->task != TASK_NULL) {
                thread->task->faults++;
        }

/*
 *      Recovery actions
 *
 *      The macros below expand in place: they refer to the local variable
 *      "object" by name, and GIVE_UP contains a "return".  They are not
 *      #undef'd at the end of this function.
 */

/*
 *      RELEASE_PAGE: undo what this routine did to the page -- run
 *      PAGE_WAKEUP_DONE on it, then unwire it under the page queues lock.
 */
#undef  RELEASE_PAGE
#define RELEASE_PAGE(m) {                               \
        PAGE_WAKEUP_DONE(m);                            \
        vm_page_lockspin_queues();                      \
        vm_page_unwire(m, TRUE);                        \
        vm_page_unlock_queues();                        \
}


/*
 *      UNLOCK_THINGS: end the paging operation begun on "object" and
 *      drop the object lock.
 */
#undef  UNLOCK_THINGS
#define UNLOCK_THINGS   {                               \
        vm_object_paging_end(object);                      \
        vm_object_unlock(object);                          \
}

/*
 *      UNLOCK_AND_DEALLOCATE: UNLOCK_THINGS, then release the object
 *      reference taken with vm_object_reference_locked() below.
 */
#undef  UNLOCK_AND_DEALLOCATE
#define UNLOCK_AND_DEALLOCATE   {                       \
        UNLOCK_THINGS;                                  \
        vm_object_deallocate(object);                   \
}
/*
 *      Give up and have caller do things the hard way.
 */

#define GIVE_UP {                                       \
        UNLOCK_AND_DEALLOCATE;                          \
        return(KERN_FAILURE);                           \
}


        /*
         *      If this entry is not directly to a vm_object, bail out.
         *      No lock or reference has been taken yet, so a plain return
         *      (rather than GIVE_UP) is correct here.
         */
        if (entry->is_sub_map) {
                assert(physpage_p == NULL);
                return KERN_FAILURE;
        }

        /*
         *      Find the backing store object and offset into it.
         */

        object = VME_OBJECT(entry);
        offset = (va - entry->vme_start) + VME_OFFSET(entry);
        prot = entry->protection;

        /*
         *      Make a reference to this object to prevent its
         *      disposal while we are messing with it.
         *      NOTE(review): "object" is locked without a NULL check;
         *      presumably callers guarantee the entry has an object -- confirm.
         */

        vm_object_lock(object);
        vm_object_reference_locked(object);
        vm_object_paging_begin(object);

        /*
         *      INVARIANTS (through entire routine):
         *
         *      1)      At all times, we must either have the object
         *              lock or a busy page in some object to prevent
         *              some other thread from trying to bring in
         *              the same page.
         *
         *      2)      Once we have a busy page, we must remove it from
         *              the pageout queues, so that the pageout daemon
         *              will not grab it away.
         *
         */

        /*
         *      Look for page in top-level object.  If it's not there or
         *      there's something going on, give up.
         */
        m = vm_page_lookup(object, offset);
        if ((m == VM_PAGE_NULL) || (m->vmp_busy) ||
            (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_absent))) {
                GIVE_UP;
        }
        if (m->vmp_fictitious &&
            VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
                /*
                 * Guard pages are fictitious pages and are never
                 * entered into a pmap, so let's say it's been wired...
                 * (the page is neither wired nor marked busy on this path).
                 */
                kr = KERN_SUCCESS;
                goto done;
        }

        /*
         *      Wire the page down now.  All bail outs beyond this
         *      point must unwire the page.
         */

        vm_page_lockspin_queues();
        vm_page_wire(m, wire_tag, TRUE);
        vm_page_unlock_queues();

        /*
         *      Mark page busy for other threads.
         */
        assert(!m->vmp_busy);
        m->vmp_busy = TRUE;
        assert(!m->vmp_absent);

        /*
         *      Give up if the page is being written and there's a copy object
         */
        if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
                RELEASE_PAGE(m);
                GIVE_UP;
        }

        /*
         *      Build the fault info handed to vm_fault_enter(): the entry's
         *      alias as user tag, and alternate accounting when the entry is
         *      IOKit-accounted or is a non-submap entry that does not use
         *      the pmap.
         */
        fault_info.user_tag = VME_ALIAS(entry);
        fault_info.pmap_options = 0;
        if (entry->iokit_acct ||
            (!entry->is_sub_map && !entry->use_pmap)) {
                fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
        }

        /*
         *      Put this page into the physical map.
         */
        type_of_fault = DBG_CACHE_HIT_FAULT;
        kr = vm_fault_enter(m,
            pmap,
            pmap_addr,
            prot,
            prot,
            TRUE,                  /* wired */
            FALSE,                 /* change_wiring */
            wire_tag,
            &fault_info,
            NULL,
            &type_of_fault);
        if (kr != KERN_SUCCESS) {
                /* The specific error is dropped: GIVE_UP returns KERN_FAILURE. */
                RELEASE_PAGE(m);
                GIVE_UP;
        }

done:
        /*
         *      Unlock everything, and return
         */

        if (physpage_p) {
                /* for vm_map_wire_and_extract() */
                if (kr == KERN_SUCCESS) {
                        assert(object == VM_PAGE_OBJECT(m));
                        *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
                        if (prot & VM_PROT_WRITE) {
                                /* a writable wiring marks the page dirty up front */
                                vm_object_lock_assert_exclusive(object);
                                m->vmp_dirty = TRUE;
                        }
                } else {
                        /* both paths that reach "done" set kr to KERN_SUCCESS */
                        *physpage_p = 0;
                }
        }

        PAGE_WAKEUP_DONE(m);
        UNLOCK_AND_DEALLOCATE;

        return kr;
}
5777
5778 /*
5779  *      Routine:        vm_fault_copy_cleanup
5780  *      Purpose:
5781  *              Release a page used by vm_fault_copy.
5782  */
5783
5784 static void
5785 vm_fault_copy_cleanup(
5786         vm_page_t       page,
5787         vm_page_t       top_page)
5788 {
5789         vm_object_t     object = VM_PAGE_OBJECT(page);
5790
5791         vm_object_lock(object);
5792         PAGE_WAKEUP_DONE(page);
5793         if (!VM_PAGE_PAGEABLE(page)) {
5794                 vm_page_lockspin_queues();
5795                 if (!VM_PAGE_PAGEABLE(page)) {
5796                         vm_page_activate(page);
5797                 }
5798                 vm_page_unlock_queues();
5799         }
5800         vm_fault_cleanup(object, top_page);
5801 }
5802
5803 static void
5804 vm_fault_copy_dst_cleanup(
5805         vm_page_t       page)
5806 {
5807         vm_object_t     object;
5808
5809         if (page != VM_PAGE_NULL) {
5810                 object = VM_PAGE_OBJECT(page);
5811                 vm_object_lock(object);
5812                 vm_page_lockspin_queues();
5813                 vm_page_unwire(page, TRUE);
5814                 vm_page_unlock_queues();
5815                 vm_object_paging_end(object);
5816                 vm_object_unlock(object);
5817         }
5818 }
5819
5820 /*
5821  *      Routine:        vm_fault_copy
5822  *
5823  *      Purpose:
5824  *              Copy pages from one virtual memory object to another --
5825  *              neither the source nor destination pages need be resident.
5826  *
5827  *              Before actually copying a page, the version associated with
5828  *              the destination address map wil be verified.
5829  *
5830  *      In/out conditions:
5831  *              The caller must hold a reference, but not a lock, to
5832  *              each of the source and destination objects and to the
5833  *              destination map.
5834  *
5835  *      Results:
5836  *              Returns KERN_SUCCESS if no errors were encountered in
5837  *              reading or writing the data.  Returns KERN_INTERRUPTED if
5838  *              the operation was interrupted (only possible if the
5839  *              "interruptible" argument is asserted).  Other return values
5840  *              indicate a permanent error in copying the data.
5841  *
5842  *              The actual amount of data copied will be returned in the
5843  *              "copy_size" argument.  In the event that the destination map
5844  *              verification failed, this amount may be less than the amount
5845  *              requested.
5846  */
5847 kern_return_t
5848 vm_fault_copy(
5849         vm_object_t             src_object,
5850         vm_object_offset_t      src_offset,
5851         vm_map_size_t           *copy_size,             /* INOUT */
5852         vm_object_t             dst_object,
5853         vm_object_offset_t      dst_offset,
5854         vm_map_t                dst_map,
5855         vm_map_version_t         *dst_version,
5856         int                     interruptible)
5857 {
5858         vm_page_t               result_page;
5859
5860         vm_page_t               src_page;
5861         vm_page_t               src_top_page;
5862         vm_prot_t               src_prot;
5863
5864         vm_page_t               dst_page;
5865         vm_page_t               dst_top_page;
5866         vm_prot_t               dst_prot;
5867
5868         vm_map_size_t           amount_left;
5869         vm_object_t             old_copy_object;
5870         vm_object_t             result_page_object = NULL;
5871         kern_return_t           error = 0;
5872         vm_fault_return_t       result;
5873
5874         vm_map_size_t           part_size;
5875         struct vm_object_fault_info fault_info_src = {};
5876         struct vm_object_fault_info fault_info_dst = {};
5877
5878         /*
5879          * In order not to confuse the clustered pageins, align
5880          * the different offsets on a page boundary.
5881          */
5882
5883 #define RETURN(x)                                       \
5884         MACRO_BEGIN                                     \
5885         *copy_size -= amount_left;                      \
5886         MACRO_RETURN(x);                                \
5887         MACRO_END
5888
5889         amount_left = *copy_size;
5890
5891         fault_info_src.interruptible = interruptible;
5892         fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
5893         fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
5894         fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
5895         fault_info_src.stealth = TRUE;
5896
5897         fault_info_dst.interruptible = interruptible;
5898         fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
5899         fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
5900         fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
5901         fault_info_dst.stealth = TRUE;
5902
5903         do { /* while (amount_left > 0) */
5904                 /*
5905                  * There may be a deadlock if both source and destination
5906                  * pages are the same. To avoid this deadlock, the copy must
5907                  * start by getting the destination page in order to apply
5908                  * COW semantics if any.
5909                  */
5910
5911 RetryDestinationFault:;
5912
5913                 dst_prot = VM_PROT_WRITE | VM_PROT_READ;
5914
5915                 vm_object_lock(dst_object);
5916                 vm_object_paging_begin(dst_object);
5917
5918                 /* cap cluster size at maximum UPL size */
5919                 upl_size_t cluster_size;
5920                 if (os_convert_overflow(amount_left, &cluster_size)) {
5921                         cluster_size = 0 - (upl_size_t)PAGE_SIZE;
5922                 }
5923                 fault_info_dst.cluster_size = cluster_size;
5924
5925                 dst_page = VM_PAGE_NULL;
5926                 result = vm_fault_page(dst_object,
5927                     vm_object_trunc_page(dst_offset),
5928                     VM_PROT_WRITE | VM_PROT_READ,
5929                     FALSE,
5930                     FALSE,                    /* page not looked up */
5931                     &dst_prot, &dst_page, &dst_top_page,
5932                     (int *)0,
5933                     &error,
5934                     dst_map->no_zero_fill,
5935                     FALSE, &fault_info_dst);
5936                 switch (result) {
5937                 case VM_FAULT_SUCCESS:
5938                         break;
5939                 case VM_FAULT_RETRY:
5940                         goto RetryDestinationFault;
5941                 case VM_FAULT_MEMORY_SHORTAGE:
5942                         if (vm_page_wait(interruptible)) {
5943                                 goto RetryDestinationFault;
5944                         }
5945                 /* fall thru */
5946                 case VM_FAULT_INTERRUPTED:
5947                         RETURN(MACH_SEND_INTERRUPTED);
5948                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5949                         /* success but no VM page: fail the copy */
5950                         vm_object_paging_end(dst_object);
5951                         vm_object_unlock(dst_object);
5952                 /*FALLTHROUGH*/
5953                 case VM_FAULT_MEMORY_ERROR:
5954                         if (error) {
5955                                 return error;
5956                         } else {
5957                                 return KERN_MEMORY_ERROR;
5958                         }
5959                 default:
5960                         panic("vm_fault_copy: unexpected error 0x%x from "
5961                             "vm_fault_page()\n", result);
5962                 }
5963                 assert((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
5964
5965                 assert(dst_object == VM_PAGE_OBJECT(dst_page));
5966                 old_copy_object = dst_object->copy;
5967
5968                 /*
5969                  * There exists the possiblity that the source and
5970                  * destination page are the same.  But we can't
5971                  * easily determine that now.  If they are the
5972                  * same, the call to vm_fault_page() for the
5973                  * destination page will deadlock.  To prevent this we
5974                  * wire the page so we can drop busy without having
5975                  * the page daemon steal the page.  We clean up the
5976                  * top page  but keep the paging reference on the object
5977                  * holding the dest page so it doesn't go away.
5978                  */
5979
5980                 vm_page_lockspin_queues();
5981                 vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE);
5982                 vm_page_unlock_queues();
5983                 PAGE_WAKEUP_DONE(dst_page);
5984                 vm_object_unlock(dst_object);
5985
5986                 if (dst_top_page != VM_PAGE_NULL) {
5987                         vm_object_lock(dst_object);
5988                         VM_PAGE_FREE(dst_top_page);
5989                         vm_object_paging_end(dst_object);
5990                         vm_object_unlock(dst_object);
5991                 }
5992
5993 RetrySourceFault:;
5994
5995                 if (src_object == VM_OBJECT_NULL) {
5996                         /*
5997                          *      No source object.  We will just
5998                          *      zero-fill the page in dst_object.
5999                          */
6000                         src_page = VM_PAGE_NULL;
6001                         result_page = VM_PAGE_NULL;
6002                 } else {
6003                         vm_object_lock(src_object);
6004                         src_page = vm_page_lookup(src_object,
6005                             vm_object_trunc_page(src_offset));
6006                         if (src_page == dst_page) {
6007                                 src_prot = dst_prot;
6008                                 result_page = VM_PAGE_NULL;
6009                         } else {
6010                                 src_prot = VM_PROT_READ;
6011                                 vm_object_paging_begin(src_object);
6012
6013                                 /* cap cluster size at maximum UPL size */
6014                                 if (os_convert_overflow(amount_left, &cluster_size)) {
6015                                         cluster_size = 0 - (upl_size_t)PAGE_SIZE;
6016                                 }
6017                                 fault_info_src.cluster_size = cluster_size;
6018
6019                                 result_page = VM_PAGE_NULL;
6020                                 result = vm_fault_page(
6021                                         src_object,
6022                                         vm_object_trunc_page(src_offset),
6023                                         VM_PROT_READ, FALSE,
6024                                         FALSE, /* page not looked up */
6025                                         &src_prot,
6026                                         &result_page, &src_top_page,
6027                                         (int *)0, &error, FALSE,
6028                                         FALSE, &fault_info_src);
6029
6030                                 switch (result) {
6031                                 case VM_FAULT_SUCCESS:
6032                                         break;
6033                                 case VM_FAULT_RETRY:
6034                                         goto RetrySourceFault;
6035                                 case VM_FAULT_MEMORY_SHORTAGE:
6036                                         if (vm_page_wait(interruptible)) {
6037                                                 goto RetrySourceFault;
6038                                         }
6039                                 /* fall thru */
6040                                 case VM_FAULT_INTERRUPTED:
6041                                         vm_fault_copy_dst_cleanup(dst_page);
6042                                         RETURN(MACH_SEND_INTERRUPTED);
6043                                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
6044                                         /* success but no VM page: fail */
6045                                         vm_object_paging_end(src_object);
6046                                         vm_object_unlock(src_object);
6047                                 /*FALLTHROUGH*/
6048                                 case VM_FAULT_MEMORY_ERROR:
6049                                         vm_fault_copy_dst_cleanup(dst_page);
6050                                         if (error) {
6051                                                 return error;
6052                                         } else {
6053                                                 return KERN_MEMORY_ERROR;
6054                                         }
6055                                 default:
6056                                         panic("vm_fault_copy(2): unexpected "
6057                                             "error 0x%x from "
6058                                             "vm_fault_page()\n", result);
6059                                 }
6060
6061                                 result_page_object = VM_PAGE_OBJECT(result_page);
6062                                 assert((src_top_page == VM_PAGE_NULL) ==
6063                                     (result_page_object == src_object));
6064                         }
6065                         assert((src_prot & VM_PROT_READ) != VM_PROT_NONE);
6066                         vm_object_unlock(result_page_object);
6067                 }
6068
6069                 vm_map_lock_read(dst_map);
6070
6071                 if (!vm_map_verify(dst_map, dst_version)) {
6072                         vm_map_unlock_read(dst_map);
6073                         if (result_page != VM_PAGE_NULL && src_page != dst_page) {
6074                                 vm_fault_copy_cleanup(result_page, src_top_page);
6075                         }
6076                         vm_fault_copy_dst_cleanup(dst_page);
6077                         break;
6078                 }
6079                 assert(dst_object == VM_PAGE_OBJECT(dst_page));
6080
6081                 vm_object_lock(dst_object);
6082
6083                 if (dst_object->copy != old_copy_object) {
6084                         vm_object_unlock(dst_object);
6085                         vm_map_unlock_read(dst_map);
6086                         if (result_page != VM_PAGE_NULL && src_page != dst_page) {
6087                                 vm_fault_copy_cleanup(result_page, src_top_page);
6088                         }
6089                         vm_fault_copy_dst_cleanup(dst_page);
6090                         break;
6091                 }
6092                 vm_object_unlock(dst_object);
6093
6094                 /*
6095                  *      Copy the page, and note that it is dirty
6096                  *      immediately.
6097                  */
6098
6099                 if (!page_aligned(src_offset) ||
6100                     !page_aligned(dst_offset) ||
6101                     !page_aligned(amount_left)) {
6102                         vm_object_offset_t      src_po,
6103                             dst_po;
6104
6105                         src_po = src_offset - vm_object_trunc_page(src_offset);
6106                         dst_po = dst_offset - vm_object_trunc_page(dst_offset);
6107
6108                         if (dst_po > src_po) {
6109                                 part_size = PAGE_SIZE - dst_po;
6110                         } else {
6111                                 part_size = PAGE_SIZE - src_po;
6112                         }
6113                         if (part_size > (amount_left)) {
6114                                 part_size = amount_left;
6115                         }
6116
6117                         if (result_page == VM_PAGE_NULL) {
6118                                 assert((vm_offset_t) dst_po == dst_po);
6119                                 assert((vm_size_t) part_size == part_size);
6120                                 vm_page_part_zero_fill(dst_page,
6121                                     (vm_offset_t) dst_po,
6122                                     (vm_size_t) part_size);
6123                         } else {
6124                                 assert((vm_offset_t) src_po == src_po);
6125                                 assert((vm_offset_t) dst_po == dst_po);
6126                                 assert((vm_size_t) part_size == part_size);
6127                                 vm_page_part_copy(result_page,
6128                                     (vm_offset_t) src_po,
6129                                     dst_page,
6130                                     (vm_offset_t) dst_po,
6131                                     (vm_size_t)part_size);
6132                                 if (!dst_page->vmp_dirty) {
6133                                         vm_object_lock(dst_object);
6134                                         SET_PAGE_DIRTY(dst_page, TRUE);
6135                                         vm_object_unlock(dst_object);
6136                                 }
6137                         }
6138                 } else {
6139                         part_size = PAGE_SIZE;
6140
6141                         if (result_page == VM_PAGE_NULL) {
6142                                 vm_page_zero_fill(dst_page);
6143                         } else {
6144                                 vm_object_lock(result_page_object);
6145                                 vm_page_copy(result_page, dst_page);
6146                                 vm_object_unlock(result_page_object);
6147
6148                                 if (!dst_page->vmp_dirty) {
6149                                         vm_object_lock(dst_object);
6150                                         SET_PAGE_DIRTY(dst_page, TRUE);
6151                                         vm_object_unlock(dst_object);
6152                                 }
6153                         }
6154                 }
6155
6156                 /*
6157                  *      Unlock everything, and return
6158                  */
6159
6160                 vm_map_unlock_read(dst_map);
6161
6162                 if (result_page != VM_PAGE_NULL && src_page != dst_page) {
6163                         vm_fault_copy_cleanup(result_page, src_top_page);
6164                 }
6165                 vm_fault_copy_dst_cleanup(dst_page);
6166
6167                 amount_left -= part_size;
6168                 src_offset += part_size;
6169                 dst_offset += part_size;
6170         } while (amount_left > 0);
6171
6172         RETURN(KERN_SUCCESS);
6173 #undef  RETURN
6174
6175         /*NOTREACHED*/
6176 }
6177
6178 #if     VM_FAULT_CLASSIFY
6179 /*
6180  *      Temporary statistics gathering support.
6181  */
6182
6183 /*
6184  *      Statistics arrays:
6185  */
6186 #define VM_FAULT_TYPES_MAX      5
6187 #define VM_FAULT_LEVEL_MAX      8
6188
6189 int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
6190
6191 #define VM_FAULT_TYPE_ZERO_FILL 0
6192 #define VM_FAULT_TYPE_MAP_IN    1
6193 #define VM_FAULT_TYPE_PAGER     2
6194 #define VM_FAULT_TYPE_COPY      3
6195 #define VM_FAULT_TYPE_OTHER     4
6196
6197
6198 void
6199 vm_fault_classify(vm_object_t           object,
6200     vm_object_offset_t    offset,
6201     vm_prot_t             fault_type)
6202 {
6203         int             type, level = 0;
6204         vm_page_t       m;
6205
6206         while (TRUE) {
6207                 m = vm_page_lookup(object, offset);
6208                 if (m != VM_PAGE_NULL) {
6209                         if (m->vmp_busy || m->vmp_error || m->vmp_restart || m->vmp_absent) {
6210                                 type = VM_FAULT_TYPE_OTHER;
6211                                 break;
6212                         }
6213                         if (((fault_type & VM_PROT_WRITE) == 0) ||
6214                             ((level == 0) && object->copy == VM_OBJECT_NULL)) {
6215                                 type = VM_FAULT_TYPE_MAP_IN;
6216                                 break;
6217                         }
6218                         type = VM_FAULT_TYPE_COPY;
6219                         break;
6220                 } else {
6221                         if (object->pager_created) {
6222                                 type = VM_FAULT_TYPE_PAGER;
6223                                 break;
6224                         }
6225                         if (object->shadow == VM_OBJECT_NULL) {
6226                                 type = VM_FAULT_TYPE_ZERO_FILL;
6227                                 break;
6228                         }
6229
6230                         offset += object->vo_shadow_offset;
6231                         object = object->shadow;
6232                         level++;
6233                         continue;
6234                 }
6235         }
6236
6237         if (level > VM_FAULT_LEVEL_MAX) {
6238                 level = VM_FAULT_LEVEL_MAX;
6239         }
6240
6241         vm_fault_stats[type][level] += 1;
6242
6243         return;
6244 }
6245
6246 /* cleanup routine to call from debugger */
6247
6248 void
6249 vm_fault_classify_init(void)
6250 {
6251         int type, level;
6252
6253         for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
6254                 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
6255                         vm_fault_stats[type][level] = 0;
6256                 }
6257         }
6258
6259         return;
6260 }
6261 #endif  /* VM_FAULT_CLASSIFY */
6262
6263 vm_offset_t
6264 kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr)
6265 {
6266         vm_map_entry_t  entry;
6267         vm_object_t     object;
6268         vm_offset_t     object_offset;
6269         vm_page_t       m;
6270         int             compressor_external_state, compressed_count_delta;
6271         int             compressor_flags = (C_DONT_BLOCK | C_KEEP | C_KDP);
6272         int             my_fault_type = VM_PROT_READ;
6273         kern_return_t   kr;
6274
6275         if (not_in_kdp) {
6276                 panic("kdp_lightweight_fault called from outside of debugger context");
6277         }
6278
6279         assert(map != VM_MAP_NULL);
6280
6281         assert((cur_target_addr & PAGE_MASK) == 0);
6282         if ((cur_target_addr & PAGE_MASK) != 0) {
6283                 return 0;
6284         }
6285
6286         if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) {
6287                 return 0;
6288         }
6289
6290         if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) {
6291                 return 0;
6292         }
6293
6294         if (entry->is_sub_map) {
6295                 return 0;
6296         }
6297
6298         object = VME_OBJECT(entry);
6299         if (object == VM_OBJECT_NULL) {
6300                 return 0;
6301         }
6302
6303         object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry);
6304
6305         while (TRUE) {
6306                 if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) {
6307                         return 0;
6308                 }
6309
6310                 if (object->pager_created && (object->paging_in_progress ||
6311                     object->activity_in_progress)) {
6312                         return 0;
6313                 }
6314
6315                 m = kdp_vm_page_lookup(object, object_offset);
6316
6317                 if (m != VM_PAGE_NULL) {
6318                         if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) {
6319                                 return 0;
6320                         }
6321
6322                         if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done || m->vmp_absent || m->vmp_error || m->vmp_cleaning ||
6323                             m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) {
6324                                 return 0;
6325                         }
6326
6327                         assert(!m->vmp_private);
6328                         if (m->vmp_private) {
6329                                 return 0;
6330                         }
6331
6332                         assert(!m->vmp_fictitious);
6333                         if (m->vmp_fictitious) {
6334                                 return 0;
6335                         }
6336
6337                         assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
6338                         if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
6339                                 return 0;
6340                         }
6341
6342                         return ptoa(VM_PAGE_GET_PHYS_PAGE(m));
6343                 }
6344
6345                 compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
6346
6347                 if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) {
6348                         if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) {
6349                                 kr = vm_compressor_pager_get(object->pager, (object_offset + object->paging_offset),
6350                                     kdp_compressor_decompressed_page_ppnum, &my_fault_type,
6351                                     compressor_flags, &compressed_count_delta);
6352                                 if (kr == KERN_SUCCESS) {
6353                                         return kdp_compressor_decompressed_page_paddr;
6354                                 } else {
6355                                         return 0;
6356                                 }
6357                         }
6358                 }
6359
6360                 if (object->shadow == VM_OBJECT_NULL) {
6361                         return 0;
6362                 }
6363
6364                 object_offset += object->vo_shadow_offset;
6365                 object = object->shadow;
6366         }
6367 }
6368
6369 /*
6370  * vm_page_validate_cs_fast():
6371  * Performs a few quick checks to determine if the page's code signature
6372  * really needs to be fully validated.  It could:
6373  *      1. have been modified (i.e. automatically tainted),
6374  *      2. have already been validated,
6375  *      3. have already been found to be tainted,
6376  *      4. no longer have a backing store.
6377  * Returns FALSE if the page needs to be fully validated.
6378  */
6379 static boolean_t
6380 vm_page_validate_cs_fast(
6381         vm_page_t       page)
6382 {
6383         vm_object_t     object;
6384
6385         object = VM_PAGE_OBJECT(page);
6386         vm_object_lock_assert_held(object);
6387
6388         if (page->vmp_wpmapped && !page->vmp_cs_tainted) {
6389                 /*
6390                  * This page was mapped for "write" access sometime in the
6391                  * past and could still be modifiable in the future.
6392                  * Consider it tainted.
6393                  * [ If the page was already found to be "tainted", no
6394                  * need to re-validate. ]
6395                  */
6396                 vm_object_lock_assert_exclusive(object);
6397                 page->vmp_cs_validated = TRUE;
6398                 page->vmp_cs_tainted = TRUE;
6399                 if (cs_debug) {
6400                         printf("CODESIGNING: %s: "
6401                             "page %p obj %p off 0x%llx "
6402                             "was modified\n",
6403                             __FUNCTION__,
6404                             page, object, page->vmp_offset);
6405                 }
6406                 vm_cs_validated_dirtied++;
6407         }
6408
6409         if (page->vmp_cs_validated || page->vmp_cs_tainted) {
6410                 return TRUE;
6411         }
6412         vm_object_lock_assert_exclusive(object);
6413
6414 #if CHECK_CS_VALIDATION_BITMAP
6415         kern_return_t kr;
6416
6417         kr = vnode_pager_cs_check_validation_bitmap(
6418                 object->pager,
6419                 page->vmp_offset + object->paging_offset,
6420                 CS_BITMAP_CHECK);
6421         if (kr == KERN_SUCCESS) {
6422                 page->vmp_cs_validated = TRUE;
6423                 page->vmp_cs_tainted = FALSE;
6424                 vm_cs_bitmap_validated++;
6425                 return TRUE;
6426         }
6427 #endif /* CHECK_CS_VALIDATION_BITMAP */
6428
6429         if (!object->alive || object->terminating || object->pager == NULL) {
6430                 /*
6431                  * The object is terminating and we don't have its pager
6432                  * so we can't validate the data...
6433                  */
6434                 return TRUE;
6435         }
6436
6437         /* we need to really validate this page */
6438         vm_object_lock_assert_exclusive(object);
6439         return FALSE;
6440 }
6441
6442 void
6443 vm_page_validate_cs_mapped_slow(
6444         vm_page_t       page,
6445         const void      *kaddr)
6446 {
6447         vm_object_t             object;
6448         memory_object_offset_t  mo_offset;
6449         memory_object_t         pager;
6450         struct vnode            *vnode;
6451         boolean_t               validated;
6452         unsigned                tainted;
6453
6454         assert(page->vmp_busy);
6455         object = VM_PAGE_OBJECT(page);
6456         vm_object_lock_assert_exclusive(object);
6457
6458         vm_cs_validates++;
6459
6460         /*
6461          * Since we get here to validate a page that was brought in by
6462          * the pager, we know that this pager is all setup and ready
6463          * by now.
6464          */
6465         assert(object->code_signed);
6466         assert(!object->internal);
6467         assert(object->pager != NULL);
6468         assert(object->pager_ready);
6469
6470         pager = object->pager;
6471         assert(object->paging_in_progress);
6472         vnode = vnode_pager_lookup_vnode(pager);
6473         mo_offset = page->vmp_offset + object->paging_offset;
6474
6475         /* verify the SHA1 hash for this page */
6476         tainted = 0;
6477         validated = cs_validate_range(vnode,
6478             pager,
6479             mo_offset,
6480             (const void *)((const char *)kaddr),
6481             PAGE_SIZE_64,
6482             &tainted);
6483
6484         if (tainted & CS_VALIDATE_TAINTED) {
6485                 page->vmp_cs_tainted = TRUE;
6486         }
6487         if (tainted & CS_VALIDATE_NX) {
6488                 page->vmp_cs_nx = TRUE;
6489         }
6490         if (validated) {
6491                 page->vmp_cs_validated = TRUE;
6492         }
6493
6494 #if CHECK_CS_VALIDATION_BITMAP
6495         if (page->vmp_cs_validated && !page->vmp_cs_tainted) {
6496                 vnode_pager_cs_check_validation_bitmap(object->pager,
6497                     mo_offset,
6498                     CS_BITMAP_SET);
6499         }
6500 #endif /* CHECK_CS_VALIDATION_BITMAP */
6501 }
6502
6503 void
6504 vm_page_validate_cs_mapped(
6505         vm_page_t       page,
6506         const void      *kaddr)
6507 {
6508         if (!vm_page_validate_cs_fast(page)) {
6509                 vm_page_validate_cs_mapped_slow(page, kaddr);
6510         }
6511 }
6512
6513 void
6514 vm_page_validate_cs(
6515         vm_page_t       page)
6516 {
6517         vm_object_t             object;
6518         vm_object_offset_t      offset;
6519         vm_map_offset_t         koffset;
6520         vm_map_size_t           ksize;
6521         vm_offset_t             kaddr;
6522         kern_return_t           kr;
6523         boolean_t               busy_page;
6524         boolean_t               need_unmap;
6525
6526         object = VM_PAGE_OBJECT(page);
6527         vm_object_lock_assert_held(object);
6528
6529         if (vm_page_validate_cs_fast(page)) {
6530                 return;
6531         }
6532         vm_object_lock_assert_exclusive(object);
6533
6534         assert(object->code_signed);
6535         offset = page->vmp_offset;
6536
6537         busy_page = page->vmp_busy;
6538         if (!busy_page) {
6539                 /* keep page busy while we map (and unlock) the VM object */
6540                 page->vmp_busy = TRUE;
6541         }
6542
6543         /*
6544          * Take a paging reference on the VM object
6545          * to protect it from collapse or bypass,
6546          * and keep it from disappearing too.
6547          */
6548         vm_object_paging_begin(object);
6549
6550         /* map the page in the kernel address space */
6551         ksize = PAGE_SIZE_64;
6552         koffset = 0;
6553         need_unmap = FALSE;
6554         kr = vm_paging_map_object(page,
6555             object,
6556             offset,
6557             VM_PROT_READ,
6558             FALSE,                       /* can't unlock object ! */
6559             &ksize,
6560             &koffset,
6561             &need_unmap);
6562         if (kr != KERN_SUCCESS) {
6563                 panic("%s: could not map page: 0x%x\n", __FUNCTION__, kr);
6564         }
6565         kaddr = CAST_DOWN(vm_offset_t, koffset);
6566
6567         /* validate the mapped page */
6568         vm_page_validate_cs_mapped_slow(page, (const void *) kaddr);
6569
6570         assert(page->vmp_busy);
6571         assert(object == VM_PAGE_OBJECT(page));
6572         vm_object_lock_assert_exclusive(object);
6573
6574         if (!busy_page) {
6575                 PAGE_WAKEUP_DONE(page);
6576         }
6577         if (need_unmap) {
6578                 /* unmap the map from the kernel address space */
6579                 vm_paging_unmap_object(object, koffset, koffset + ksize);
6580                 koffset = 0;
6581                 ksize = 0;
6582                 kaddr = 0;
6583         }
6584         vm_object_paging_end(object);
6585 }
6586
6587 void
6588 vm_page_validate_cs_mapped_chunk(
6589         vm_page_t       page,
6590         const void      *kaddr,
6591         vm_offset_t     chunk_offset,
6592         vm_size_t       chunk_size,
6593         boolean_t       *validated_p,
6594         unsigned        *tainted_p)
6595 {
6596         vm_object_t             object;
6597         vm_object_offset_t      offset, offset_in_page;
6598         memory_object_t         pager;
6599         struct vnode            *vnode;
6600         boolean_t               validated;
6601         unsigned                tainted;
6602
6603         *validated_p = FALSE;
6604         *tainted_p = 0;
6605
6606         assert(page->vmp_busy);
6607         object = VM_PAGE_OBJECT(page);
6608         vm_object_lock_assert_exclusive(object);
6609
6610         assert(object->code_signed);
6611         offset = page->vmp_offset;
6612
6613         if (!object->alive || object->terminating || object->pager == NULL) {
6614                 /*
6615                  * The object is terminating and we don't have its pager
6616                  * so we can't validate the data...
6617                  */
6618                 return;
6619         }
6620         /*
6621          * Since we get here to validate a page that was brought in by
6622          * the pager, we know that this pager is all setup and ready
6623          * by now.
6624          */
6625         assert(!object->internal);
6626         assert(object->pager != NULL);
6627         assert(object->pager_ready);
6628
6629         pager = object->pager;
6630         assert(object->paging_in_progress);
6631         vnode = vnode_pager_lookup_vnode(pager);
6632
6633         /* verify the signature for this chunk */
6634         offset_in_page = chunk_offset;
6635         assert(offset_in_page < PAGE_SIZE);
6636
6637         tainted = 0;
6638         validated = cs_validate_range(vnode,
6639             pager,
6640             (object->paging_offset +
6641             offset +
6642             offset_in_page),
6643             (const void *)((const char *)kaddr
6644             + offset_in_page),
6645             chunk_size,
6646             &tainted);
6647         if (validated) {
6648                 *validated_p = TRUE;
6649         }
6650         if (tainted) {
6651                 *tainted_p = tainted;
6652         }
6653 }
6654
6655 static void
6656 vm_rtfrecord_lock(void)
6657 {
6658         lck_spin_lock(&vm_rtfr_slock);
6659 }
6660
6661 static void
6662 vm_rtfrecord_unlock(void)
6663 {
6664         lck_spin_unlock(&vm_rtfr_slock);
6665 }
6666
6667 unsigned int
6668 vmrtfaultinfo_bufsz(void)
6669 {
6670         return vmrtf_num_records * sizeof(vm_rtfault_record_t);
6671 }
6672
6673 #include <kern/backtrace.h>
6674
6675 static void
6676 vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault)
6677 {
6678         uint64_t fend = mach_continuous_time();
6679
6680         uint64_t cfpc = 0;
6681         uint64_t ctid = cthread->thread_id;
6682         uint64_t cupid = get_current_unique_pid();
6683
6684         uintptr_t bpc = 0;
6685         int btr = 0;
6686         bool u64 = false;
6687
6688         /* Capture a single-frame backtrace; this extracts just the program
6689          * counter at the point of the fault into "bpc", and should perform no
6690          * further user stack traversals, thus avoiding copyin()s and further
6691          * faults.
6692          */
6693         unsigned int bfrs = backtrace_thread_user(cthread, &bpc, 1U, &btr, &u64, NULL);
6694
6695         if ((btr == 0) && (bfrs > 0)) {
6696                 cfpc = bpc;
6697         }
6698
6699         assert((fstart != 0) && fend >= fstart);
6700         vm_rtfrecord_lock();
6701         assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi);
6702
6703         vmrtfrs.vmrtf_total++;
6704         vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++];
6705
6706         cvmr->rtfabstime = fstart;
6707         cvmr->rtfduration = fend - fstart;
6708         cvmr->rtfaddr = fault_vaddr;
6709         cvmr->rtfpc = cfpc;
6710         cvmr->rtftype = type_of_fault;
6711         cvmr->rtfupid = cupid;
6712         cvmr->rtftid = ctid;
6713
6714         if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) {
6715                 vmrtfrs.vmrtfr_curi = 0;
6716         }
6717
6718         vm_rtfrecord_unlock();
6719 }
6720
6721 int
6722 vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, int vrecordsz, void *vrecords, int *vmrtfrv)
6723 {
6724         vm_rtfault_record_t *cvmrd = vrecords;
6725         size_t residue = vrecordsz;
6726         int numextracted = 0;
6727         boolean_t early_exit = FALSE;
6728
6729         vm_rtfrecord_lock();
6730
6731         for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) {
6732                 if (residue < sizeof(vm_rtfault_record_t)) {
6733                         early_exit = TRUE;
6734                         break;
6735                 }
6736
6737                 if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) {
6738 #if     DEVELOPMENT || DEBUG
6739                         if (isroot == FALSE) {
6740                                 continue;
6741                         }
6742 #else
6743                         continue;
6744 #endif /* DEVDEBUG */
6745                 }
6746
6747                 *cvmrd = vmrtfrs.vm_rtf_records[vmfi];
6748                 cvmrd++;
6749                 residue -= sizeof(vm_rtfault_record_t);
6750                 numextracted++;
6751         }
6752
6753         vm_rtfrecord_unlock();
6754
6755         *vmrtfrv = numextracted;
6756         return early_exit;
6757 }