osfmk/vm/vm_fault.c

   1 /*
   2  * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   vm_fault.c
  60  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  61  *
  62  *      Page fault handling module.
  63  */
  64
  65 #include <mach_cluster_stats.h>
  66 #include <mach_pagemap.h>
  67 #include <libkern/OSAtomic.h>
  68
  69 #include <mach/mach_types.h>
  70 #include <mach/kern_return.h>
  71 #include <mach/message.h>       /* for error codes */
  72 #include <mach/vm_param.h>
  73 #include <mach/vm_behavior.h>
  74 #include <mach/memory_object.h>
  75                                 /* For memory_object_data_{request,unlock} */
  76 #include <mach/sdt.h>
  77
  78 #include <kern/kern_types.h>
  79 #include <kern/host_statistics.h>
  80 #include <kern/counters.h>
  81 #include <kern/task.h>
  82 #include <kern/thread.h>
  83 #include <kern/sched_prim.h>
  84 #include <kern/host.h>
  85 #include <kern/xpr.h>
  86 #include <kern/mach_param.h>
  87 #include <kern/macro_help.h>
  88 #include <kern/zalloc.h>
  89 #include <kern/misc_protos.h>
  90 #include <kern/policy_internal.h>
  91
  92 #include <vm/vm_compressor.h>
  93 #include <vm/vm_compressor_pager.h>
  94 #include <vm/vm_fault.h>
  95 #include <vm/vm_map.h>
  96 #include <vm/vm_object.h>
  97 #include <vm/vm_page.h>
  98 #include <vm/vm_kern.h>
  99 #include <vm/pmap.h>
 100 #include <vm/vm_pageout.h>
 101 #include <vm/vm_protos.h>
 102 #include <vm/vm_external.h>
 103 #include <vm/memory_object.h>
 104 #include <vm/vm_purgeable_internal.h>   /* Needed by some vm_page.h macros */
 105 #include <vm/vm_shared_region.h>
 106
 107 #include <sys/codesign.h>
 108 #include <sys/reason.h>
 109 #include <sys/signalvar.h>
 110
 111 #include <san/kasan.h>
 112
  113 #define VM_FAULT_CLASSIFY       0       /* non-zero exposes the vm_fault_classify*() prototypes further down */
  114
  115 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) non-zero compiles in the dbgTrace() calls in this file */
  116
  117 unsigned int    vm_object_pagein_throttle = 16; /* NOTE(review): not referenced in the part of the file reviewed here; presumably a per-object page-in limit -- confirm at its users */
 118
 119 /*
 120  * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
 121  * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
 122  * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
 123  * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
 124  * keep the UI active so that the user has a chance to kill the offending task before the system
 125  * completely hangs.
 126  *
 127  * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
 128  * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
 129  * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
 130  * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
 131  */
 132
  133 extern void throttle_lowpri_io(int);
  134
  135 extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);
  136
  137 uint64_t vm_hard_throttle_threshold;    /* set by vm_fault_init() as a percentage of sane_size */
  138
  139 /* True when vm_wants_task_throttled() says so for the current task, or when free pages are below */
  140 /* vm_page_throttle_limit and the thread's effective I/O policy is above THROTTLE_LEVEL_THROTTLED. */
  141 #define NEED_TO_HARD_THROTTLE_THIS_TASK()       (vm_wants_task_throttled(current_task()) ||     \
  142                                                  (vm_page_free_count < vm_page_throttle_limit && \
  143                                                   proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) > THROTTLE_LEVEL_THROTTLED))
  144
  145 /* Delays returned by vm_page_throttled(); vm_fault_check() passes the value to delay(). */
  146 #define HARD_THROTTLE_DELAY     5000    /* 5000 us == 5 ms */
  147 #define SOFT_THROTTLE_DELAY     200     /* 200 us == .2 ms */
  148
  149 #define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS   6       /* length of the per-thread page creation window, in seconds */
  150 #define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC  20000   /* pages/sec; a thread is only considered once its count exceeds PERIOD_SECS * RATE_PER_SEC */
  151
  152
  153 boolean_t current_thread_aborted(void); /* defined elsewhere; vm_fault_check() calls it after a throttle delay */
 154
  155 /* Forward declarations of internal routines. */
  156 static kern_return_t vm_fault_wire_fast(
  157                                 vm_map_t        map,
  158                                 vm_map_offset_t va,
  159                                 vm_prot_t       prot,
  160                                 vm_tag_t        wire_tag,
  161                                 vm_map_entry_t  entry,
  162                                 pmap_t          pmap,
  163                                 vm_map_offset_t pmap_addr,
  164                                 ppnum_t         *physpage_p);
  165 /* The definitions of these static routines are not in the part of the file reviewed here. */
  166 static kern_return_t vm_fault_internal(
  167                 vm_map_t        map,
  168                 vm_map_offset_t vaddr,
  169                 vm_prot_t       caller_prot,
  170                 boolean_t       change_wiring,
  171                 vm_tag_t        wire_tag,
  172                 int             interruptible,
  173                 pmap_t          pmap,
  174                 vm_map_offset_t pmap_addr,
  175                 ppnum_t         *physpage_p);
  176
  177 static void vm_fault_copy_cleanup(
  178                                 vm_page_t       page,
  179                                 vm_page_t       top_page);
  180
  181 static void vm_fault_copy_dst_cleanup(
  182                                 vm_page_t       page);
  183
  184 #if     VM_FAULT_CLASSIFY       /* defined as 0 above, so these prototypes are normally compiled out */
  185 extern void vm_fault_classify(vm_object_t       object,
  186                           vm_object_offset_t    offset,
  187                           vm_prot_t             fault_type);
  188
  189 extern void vm_fault_classify_init(void);
  190 #endif
  191 /* NOTE(review): the counters below are not updated in the part of the file reviewed here; meanings are inferred from the names only. */
  192 unsigned long vm_pmap_enter_blocked = 0;
  193 unsigned long vm_pmap_enter_retried = 0;
  194
  195 unsigned long vm_cs_validates = 0;
  196 unsigned long vm_cs_revalidates = 0;
  197 unsigned long vm_cs_query_modified = 0;
  198 unsigned long vm_cs_validated_dirtied = 0;
  199 unsigned long vm_cs_bitmap_validated = 0;
  200
  201 void vm_pre_fault(vm_map_offset_t);
  202 /* Defined elsewhere; presumably the debugger's (kdp) scratch page for decompression -- confirm. */
  203 extern char *kdp_compressor_decompressed_page;
  204 extern addr64_t kdp_compressor_decompressed_page_paddr;
  205 extern ppnum_t  kdp_compressor_decompressed_page_ppnum;
 206
 207 /*
 208  *      Routine:        vm_fault_init
 209  *      Purpose:
 210  *              Initialize our private data structures.
 211  */
 212 void
 213 vm_fault_init(void)
 214 {
 215         int i, vm_compressor_temp;
 216         boolean_t need_default_val = TRUE;
 217         /*
 218          * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
 219          * computed as a percentage of available memory, and the percentage used is scaled inversely with
 220          * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
 221          * and reduce the value down to 10% for very large memory configurations.  This helps give us a
 222          * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
 223          * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
 224          */
 225
 226         vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100;
 227
 228         /*
 229          * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
 230          */
 231
 232         if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof (vm_compressor_temp))) {
 233                 for ( i = 0; i < VM_PAGER_MAX_MODES; i++) {
 234                         if (vm_compressor_temp > 0 &&
 235                             ((vm_compressor_temp & ( 1 << i)) == vm_compressor_temp)) {
 236                                 need_default_val = FALSE;
 237                                 vm_compressor_mode = vm_compressor_temp;
 238                                 break;
 239                         }
 240                 }
 241                 if (need_default_val)
 242                         printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
 243         }
 244         if (need_default_val) {
 245                 /* If no boot arg or incorrect boot arg, try device tree. */
 246                 PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
 247         }
 248         PE_parse_boot_argn("vm_compressor_threads", &vm_compressor_thread_count, sizeof (vm_compressor_thread_count));
 249
 250         printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
 251 }
 252
 253 /*
 254  *      Routine:        vm_fault_cleanup
 255  *      Purpose:
 256  *              Clean up the result of vm_fault_page.
 257  *      Results:
 258  *              The paging reference for "object" is released.
 259  *              "object" is unlocked.
 260  *              If "top_page" is not null,  "top_page" is
 261  *              freed and the paging reference for the object
 262  *              containing it is released.
 263  *
 264  *      In/out conditions:
 265  *              "object" must be locked.
 266  */
 267 void
 268 vm_fault_cleanup(
 269         vm_object_t     object,
 270         vm_page_t       top_page)
 271 {
 272         vm_object_paging_end(object);
 273         vm_object_unlock(object);
 274
 275         if (top_page != VM_PAGE_NULL) {
 276                 object = VM_PAGE_OBJECT(top_page);
 277
 278                 vm_object_lock(object);
 279                 VM_PAGE_FREE(top_page);
 280                 vm_object_paging_end(object);
 281                 vm_object_unlock(object);
 282         }
 283 }
 284
  285 #if     MACH_CLUSTER_STATS      /* optional per-cluster-size counters */
  286 #define MAXCLUSTERPAGES 16
  287 struct {
  288         unsigned long pages_in_cluster;
  289         unsigned long pages_at_higher_offsets;
  290         unsigned long pages_at_lower_offsets;
  291 } cluster_stats_in[MAXCLUSTERPAGES];
  292 #define CLUSTER_STAT(clause)    clause  /* stats enabled: the wrapped statement is compiled in */
  293 #define CLUSTER_STAT_HIGHER(x)  \
  294         ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
  295 #define CLUSTER_STAT_LOWER(x)   \
  296          ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
  297 #define CLUSTER_STAT_CLUSTER(x) \
  298         ((cluster_stats_in[(x)].pages_in_cluster)++)
  299 #else   /* MACH_CLUSTER_STATS */
  300 #define CLUSTER_STAT(clause)    /* stats disabled: expands to nothing; the other CLUSTER_STAT_* macros are not defined on this branch */
  301 #endif  /* MACH_CLUSTER_STATS */
  302 /* True when x has none of the bits below PAGE_SIZE_64 set (page aligned, assuming a power-of-two page size). */
  303 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
  304
  305 /* Master switch: vm_fault_deactivate_behind() returns FALSE immediately when this is FALSE. */
  306 boolean_t       vm_page_deactivate_behind = TRUE;
  307 /*
  308  * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
  309  */
  310 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW     128
  311 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER    16              /* don't make this too big... */
  312                                                                 /* we use it to size an array on the stack */
  313 /* Distance, in pages, that vm_fault_deactivate_behind() looks behind the faulting offset in the default case. */
  314 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
  315 /* Bound on the magnitude of object->sequential maintained by vm_fault_is_sequential(). */
  316 #define MAX_SEQUENTIAL_RUN      (1024 * 1024 * 1024)
 317
 318 /*
 319  * vm_page_is_sequential
 320  *
 321  * Determine if sequential access is in progress
 322  * in accordance with the behavior specified.
 323  * Update state to indicate current access pattern.
 324  *
 325  * object must have at least the shared lock held
 326  */
 327 static
 328 void
 329 vm_fault_is_sequential(
 330         vm_object_t             object,
 331         vm_object_offset_t      offset,
 332         vm_behavior_t           behavior)
 333 {
 334         vm_object_offset_t      last_alloc;
 335         int                     sequential;
 336         int                     orig_sequential;
 337
 338         last_alloc = object->last_alloc;
 339         sequential = object->sequential;
 340         orig_sequential = sequential;
 341
 342         switch (behavior) {
 343         case VM_BEHAVIOR_RANDOM:
 344                 /*
 345                  * reset indicator of sequential behavior
 346                  */
 347                 sequential = 0;
 348                 break;
 349
 350         case VM_BEHAVIOR_SEQUENTIAL:
 351                 if (offset && last_alloc == offset - PAGE_SIZE_64) {
 352                         /*
 353                          * advance indicator of sequential behavior
 354                          */
 355                         if (sequential < MAX_SEQUENTIAL_RUN)
 356                                 sequential += PAGE_SIZE;
 357                 } else {
 358                         /*
 359                          * reset indicator of sequential behavior
 360                          */
 361                         sequential = 0;
 362                 }
 363                 break;
 364
 365         case VM_BEHAVIOR_RSEQNTL:
 366                 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
 367                         /*
 368                          * advance indicator of sequential behavior
 369                          */
 370                         if (sequential > -MAX_SEQUENTIAL_RUN)
 371                                 sequential -= PAGE_SIZE;
 372                 } else {
 373                         /*
 374                          * reset indicator of sequential behavior
 375                          */
 376                         sequential = 0;
 377                 }
 378                 break;
 379
 380         case VM_BEHAVIOR_DEFAULT:
 381         default:
 382                 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
 383                         /*
 384                          * advance indicator of sequential behavior
 385                          */
 386                         if (sequential < 0)
 387                                 sequential = 0;
 388                         if (sequential < MAX_SEQUENTIAL_RUN)
 389                                 sequential += PAGE_SIZE;
 390
 391                 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
 392                         /*
 393                          * advance indicator of sequential behavior
 394                          */
 395                         if (sequential > 0)
 396                                 sequential = 0;
 397                         if (sequential > -MAX_SEQUENTIAL_RUN)
 398                                 sequential -= PAGE_SIZE;
 399                 } else {
 400                         /*
 401                          * reset indicator of sequential behavior
 402                          */
 403                         sequential = 0;
 404                 }
 405                 break;
 406         }
 407         if (sequential != orig_sequential) {
 408                 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
 409                         /*
 410                          * if someone else has already updated object->sequential
 411                          * don't bother trying to update it or object->last_alloc
 412                          */
 413                         return;
 414                 }
 415         }
 416         /*
 417          * I'd like to do this with a OSCompareAndSwap64, but that
 418          * doesn't exist for PPC...  however, it shouldn't matter
 419          * that much... last_alloc is maintained so that we can determine
 420          * if a sequential access pattern is taking place... if only
 421          * one thread is banging on this object, no problem with the unprotected
 422          * update... if 2 or more threads are banging away, we run the risk of
 423          * someone seeing a mangled update... however, in the face of multiple
 424          * accesses, no sequential access pattern can develop anyway, so we
 425          * haven't lost any real info.
 426          */
 427         object->last_alloc = offset;
 428 }
 429
 430
 431 int vm_page_deactivate_behind_count = 0;
 432
 433 /*
 434  * vm_page_deactivate_behind
 435  *
 436  * Determine if sequential access is in progress
 437  * in accordance with the behavior specified.  If
 438  * so, compute a potential page to deactivate and
 439  * deactivate it.
 440  *
 441  * object must be locked.
 442  *
 443  * return TRUE if we actually deactivate a page
 444  */
 445 static
 446 boolean_t
 447 vm_fault_deactivate_behind(
 448         vm_object_t             object,
 449         vm_object_offset_t      offset,
 450         vm_behavior_t           behavior)
 451 {
 452         int             n;
 453         int             pages_in_run = 0;
 454         int             max_pages_in_run = 0;
 455         int             sequential_run;
 456         int             sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 457         vm_object_offset_t      run_offset = 0;
 458         vm_object_offset_t      pg_offset = 0;
 459         vm_page_t       m;
 460         vm_page_t       page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
 461
 462         pages_in_run = 0;
 463 #if TRACEFAULTPAGE
 464         dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
 465 #endif
 466
 467         if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
 468                 /*
 469                  * Do not deactivate pages from the kernel object: they
 470                  * are not intended to become pageable.
 471                  * or we've disabled the deactivate behind mechanism
 472                  */
 473                 return FALSE;
 474         }
 475         if ((sequential_run = object->sequential)) {
 476                   if (sequential_run < 0) {
 477                           sequential_behavior = VM_BEHAVIOR_RSEQNTL;
 478                           sequential_run = 0 - sequential_run;
 479                   } else {
 480                           sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 481                   }
 482         }
 483         switch (behavior) {
 484         case VM_BEHAVIOR_RANDOM:
 485                 break;
 486         case VM_BEHAVIOR_SEQUENTIAL:
 487                 if (sequential_run >= (int)PAGE_SIZE) {
 488                         run_offset = 0 - PAGE_SIZE_64;
 489                         max_pages_in_run = 1;
 490                 }
 491                 break;
 492         case VM_BEHAVIOR_RSEQNTL:
 493                 if (sequential_run >= (int)PAGE_SIZE) {
 494                         run_offset = PAGE_SIZE_64;
 495                         max_pages_in_run = 1;
 496                 }
 497                 break;
 498         case VM_BEHAVIOR_DEFAULT:
 499         default:
 500         {       vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
 501
 502                 /*
 503                  * determine if the run of sequential accesss has been
 504                  * long enough on an object with default access behavior
 505                  * to consider it for deactivation
 506                  */
 507                 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
 508                         /*
 509                          * the comparisons between offset and behind are done
 510                          * in this kind of odd fashion in order to prevent wrap around
 511                          * at the end points
 512                          */
 513                         if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
 514                                 if (offset >= behind) {
 515                                         run_offset = 0 - behind;
 516                                         pg_offset = PAGE_SIZE_64;
 517                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 518                                 }
 519                         } else {
 520                                 if (offset < -behind) {
 521                                         run_offset = behind;
 522                                         pg_offset = 0 - PAGE_SIZE_64;
 523                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 524                                 }
 525                         }
 526                 }
 527                 break;
 528         }
 529         }
 530         for (n = 0; n < max_pages_in_run; n++) {
 531                 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
 532
 533                 if (m && !m->laundry && !m->busy && !m->no_cache && (m->vm_page_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->fictitious && !m->absent) {
 534                         page_run[pages_in_run++] = m;
 535
 536                         /*
 537                          * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
 538                          *
 539                          * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
 540                          * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
 541                          * new reference happens. If no futher references happen on the page after that remote TLB flushes
 542                          * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
 543                          * by pageout_scan, which is just fine since the last reference would have happened quite far
 544                          * in the past (TLB caches don't hang around for very long), and of course could just as easily
 545                          * have happened before we did the deactivate_behind.
 546                          */
 547                         pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
 548                 }
 549         }
 550         if (pages_in_run) {
 551                 vm_page_lockspin_queues();
 552
 553                 for (n = 0; n < pages_in_run; n++) {
 554
 555                         m = page_run[n];
 556
 557                         vm_page_deactivate_internal(m, FALSE);
 558
 559                         vm_page_deactivate_behind_count++;
 560 #if TRACEFAULTPAGE
 561                         dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
 562 #endif
 563                 }
 564                 vm_page_unlock_queues();
 565
 566                 return TRUE;
 567         }
 568         return FALSE;
 569 }
 570
 571
 572 #if (DEVELOPMENT || DEBUG)
 573 uint32_t        vm_page_creation_throttled_hard = 0;
 574 uint32_t        vm_page_creation_throttled_soft = 0;
 575 uint64_t        vm_page_creation_throttle_avoided = 0;
 576 #endif /* DEVELOPMENT || DEBUG */
 577
 578 static int
 579 vm_page_throttled(boolean_t page_kept)
 580 {
 581         clock_sec_t     elapsed_sec;
 582         clock_sec_t     tv_sec;
 583         clock_usec_t    tv_usec;
 584
 585         thread_t thread = current_thread();
 586
 587         if (thread->options & TH_OPT_VMPRIV)
 588                 return (0);
 589
 590         if (thread->t_page_creation_throttled) {
 591                 thread->t_page_creation_throttled = 0;
 592
 593                 if (page_kept == FALSE)
 594                         goto no_throttle;
 595         }
 596         if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
 597 #if (DEVELOPMENT || DEBUG)
 598                 thread->t_page_creation_throttled_hard++;
 599                 OSAddAtomic(1, &vm_page_creation_throttled_hard);
 600 #endif /* DEVELOPMENT || DEBUG */
 601                 return (HARD_THROTTLE_DELAY);
 602         }
 603
 604         if ((vm_page_free_count < vm_page_throttle_limit || (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
 605             thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
 606
 607                 if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) {
 608 #if (DEVELOPMENT || DEBUG)
 609                         OSAddAtomic64(1, &vm_page_creation_throttle_avoided);
 610 #endif
 611                         goto no_throttle;
 612                 }
 613                 clock_get_system_microtime(&tv_sec, &tv_usec);
 614
 615                 elapsed_sec = tv_sec - thread->t_page_creation_time;
 616
 617                 if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
 618                     (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
 619
 620                         if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
 621                                 /*
 622                                  * we'll reset our stats to give a well behaved app
 623                                  * that was unlucky enough to accumulate a bunch of pages
 624                                  * over a long period of time a chance to get out of
 625                                  * the throttled state... we reset the counter and timestamp
 626                                  * so that if it stays under the rate limit for the next second
 627                                  * it will be back in our good graces... if it exceeds it, it
 628                                  * will remain in the throttled state
 629                                  */
 630                                 thread->t_page_creation_time = tv_sec;
 631                                 thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
 632                         }
 633                         ++vm_page_throttle_count;
 634
 635                         thread->t_page_creation_throttled = 1;
 636
 637                         if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) {
 638 #if (DEVELOPMENT || DEBUG)
 639                                 thread->t_page_creation_throttled_hard++;
 640                                 OSAddAtomic(1, &vm_page_creation_throttled_hard);
 641 #endif /* DEVELOPMENT || DEBUG */
 642                                 return (HARD_THROTTLE_DELAY);
 643                         } else {
 644 #if (DEVELOPMENT || DEBUG)
 645                                 thread->t_page_creation_throttled_soft++;
 646                                 OSAddAtomic(1, &vm_page_creation_throttled_soft);
 647 #endif /* DEVELOPMENT || DEBUG */
 648                                 return (SOFT_THROTTLE_DELAY);
 649                         }
 650                 }
 651                 thread->t_page_creation_time = tv_sec;
 652                 thread->t_page_creation_count = 0;
 653         }
 654 no_throttle:
 655         thread->t_page_creation_count++;
 656
 657         return (0);
 658 }
 659
 660
 661 /*
 662  * check for various conditions that would
 663  * prevent us from creating a ZF page...
 664  * cleanup is based on being called from vm_fault_page
 665  *
 666  * object must be locked
 667  * object == m->object
 668  */
 669 static vm_fault_return_t
 670 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state, boolean_t page_throttle)
 671 {
 672         int throttle_delay;
 673
 674         if (object->shadow_severed ||
 675             VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
 676                 /*
 677                  * Either:
 678                  * 1. the shadow chain was severed,
 679                  * 2. the purgeable object is volatile or empty and is marked
 680                  *    to fault on access while volatile.
 681                  * Just have to return an error at this point
 682                  */
 683                 if (m != VM_PAGE_NULL)
 684                         VM_PAGE_FREE(m);
 685                 vm_fault_cleanup(object, first_m);
 686
 687                 thread_interrupt_level(interruptible_state);
 688
 689                 return (VM_FAULT_MEMORY_ERROR);
 690         }
 691         if (vm_backing_store_low) {
 692                 /*
 693                  * are we protecting the system from
 694                  * backing store exhaustion.  If so
 695                  * sleep unless we are privileged.
 696                  */
 697                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
 698
 699                         if (m != VM_PAGE_NULL)
 700                                 VM_PAGE_FREE(m);
 701                         vm_fault_cleanup(object, first_m);
 702
 703                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
 704
 705                         thread_block(THREAD_CONTINUE_NULL);
 706                         thread_interrupt_level(interruptible_state);
 707
 708                         return (VM_FAULT_RETRY);
 709                 }
 710         }
 711         if (page_throttle == TRUE) {
 712                 if ((throttle_delay = vm_page_throttled(FALSE))) {
 713                         /*
 714                          * we're throttling zero-fills...
 715                          * treat this as if we couldn't grab a page
 716                          */
 717                         if (m != VM_PAGE_NULL)
 718                                 VM_PAGE_FREE(m);
 719                         vm_fault_cleanup(object, first_m);
 720
 721                         VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
 722
 723                         delay(throttle_delay);
 724
 725                         if (current_thread_aborted()) {
 726                                 thread_interrupt_level(interruptible_state);
 727                                 return VM_FAULT_INTERRUPTED;
 728                         }
 729                         thread_interrupt_level(interruptible_state);
 730
 731                         return (VM_FAULT_MEMORY_SHORTAGE);
 732                 }
 733         }
 734         return (VM_FAULT_SUCCESS);
 735 }
 736
 737
 738 /*
 739  * do the work to zero fill a page and
 740  * inject it into the correct paging queue
 741  *
 742  * m->object must be locked
 743  * page queue lock must NOT be held
 744  */
 745 static int
 746 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
 747 {
 748         int my_fault = DBG_ZERO_FILL_FAULT;
 749         vm_object_t     object;
 750
 751         object = VM_PAGE_OBJECT(m);
 752
 753         /*
 754          * This is is a zero-fill page fault...
 755          *
 756          * Checking the page lock is a waste of
 757          * time;  this page was absent, so
 758          * it can't be page locked by a pager.
 759          *
 760          * we also consider it undefined
 761          * with respect to instruction
 762          * execution.  i.e. it is the responsibility
 763          * of higher layers to call for an instruction
 764          * sync after changing the contents and before
 765          * sending a program into this area.  We
 766          * choose this approach for performance
 767          */
 768         m->pmapped = TRUE;
 769
 770         m->cs_validated = FALSE;
 771         m->cs_tainted = FALSE;
 772         m->cs_nx = FALSE;
 773
 774         if (no_zero_fill == TRUE) {
 775                 my_fault = DBG_NZF_PAGE_FAULT;
 776
 777                 if (m->absent && m->busy)
 778                         return (my_fault);
 779         } else {
 780                 vm_page_zero_fill(m);
 781
 782                 VM_STAT_INCR(zero_fill_count);
 783                 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
 784         }
 785         assert(!m->laundry);
 786         assert(object != kernel_object);
 787         //assert(m->pageq.next == 0 && m->pageq.prev == 0);
 788
 789         if (!VM_DYNAMIC_PAGING_ENABLED() &&
 790                 (object->purgable == VM_PURGABLE_DENY ||
 791                  object->purgable == VM_PURGABLE_NONVOLATILE ||
 792                  object->purgable == VM_PURGABLE_VOLATILE )) {
 793
 794                 vm_page_lockspin_queues();
 795
 796                 if (!VM_DYNAMIC_PAGING_ENABLED()) {
 797                         assert(!VM_PAGE_WIRED(m));
 798
 799                         /*
 800                          * can't be on the pageout queue since we don't
 801                          * have a pager to try and clean to
 802                          */
 803                         vm_page_queues_remove(m, TRUE);
 804                         vm_page_check_pageable_safe(m);
 805                         vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
 806                         m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q;
 807                         vm_page_throttled_count++;
 808                 }
 809                 vm_page_unlock_queues();
 810         }
 811         return (my_fault);
 812 }
 813
 814
 815 /*
 816  *      Routine:        vm_fault_page
 817  *      Purpose:
 818  *              Find the resident page for the virtual memory
 819  *              specified by the given virtual memory object
 820  *              and offset.
 821  *      Additional arguments:
 822  *              The required permissions for the page are given
 823  *              in "fault_type".  Desired permissions are included
 824  *              in "protection".
 825  *              fault_info is passed along to determine pagein cluster
 826  *              limits... it contains the expected reference pattern,
 827  *              cluster size if available, etc...
 828  *
 829  *              If the desired page is known to be resident (for
 830  *              example, because it was previously wired down), asserting
 831  *              the "unwiring" parameter will speed the search.
 832  *
 833  *              If the operation can be interrupted (by thread_abort
 834  *              or thread_terminate), then the "interruptible"
 835  *              parameter should be asserted.
 836  *
 837  *      Results:
 838  *              The page containing the proper data is returned
 839  *              in "result_page".
 840  *
 841  *      In/out conditions:
 842  *              The source object must be locked and referenced,
 843  *              and must donate one paging reference.  The reference
 844  *              is not affected.  The paging reference and lock are
 845  *              consumed.
 846  *
 847  *              If the call succeeds, the object in which "result_page"
 848  *              resides is left locked and holding a paging reference.
 849  *              If this is not the original object, a busy page in the
 850  *              original object is returned in "top_page", to prevent other
 851  *              callers from pursuing this same data, along with a paging
 852  *              reference for the original object.  The "top_page" should
 853  *              be destroyed when this guarantee is no longer required.
 854  *              The "result_page" is also left busy.  It is not removed
 855  *              from the pageout queues.
 856  *      Special Case:
 857  *              A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
 858  *              fault succeeded but there's no VM page (i.e. the VM object
 859  *              does not actually hold VM pages, but device memory or
 860  *              large pages).  The object is still locked and we still hold a
 861  *              paging_in_progress reference.
 862  */
 863 unsigned int vm_fault_page_blocked_access = 0;     /* incremented by vm_fault_page() each time it has finished sleeping on an object whose "blocked_access" was set */
 864 unsigned int vm_fault_page_forced_retry = 0;       /* NOTE(review): not updated in this part of the file; presumably counts faults forced to retry -- confirm */
 865
 866 vm_fault_return_t
 867 vm_fault_page(
 868         /* Arguments: */
 869         vm_object_t     first_object,   /* Object to begin search */
 870         vm_object_offset_t first_offset,        /* Offset into object */
 871         vm_prot_t       fault_type,     /* What access is requested */
 872         boolean_t       must_be_resident,/* Must page be resident? */
 873         boolean_t       caller_lookup,  /* caller looked up page */
 874         /* Modifies in place: */
 875         vm_prot_t       *protection,    /* Protection for mapping */
 876         vm_page_t       *result_page,   /* Page found, if successful */
 877         /* Returns: */
 878         vm_page_t       *top_page,      /* Page in top object, if
 879                                          * not result_page.  */
 880         int             *type_of_fault, /* if non-null, fill in with type of fault
 881                                          * COW, zero-fill, etc... returned in trace point */
 882         /* More arguments: */
 883         kern_return_t   *error_code,    /* code if page is in error */
 884         boolean_t       no_zero_fill,   /* don't zero fill absent pages */
 885         boolean_t       data_supply,    /* treat as data_supply if
 886                                          * it is a write fault and a full
 887                                          * page is provided */
 888         vm_object_fault_info_t fault_info)
 889 {
 890         vm_page_t               m;
 891         vm_object_t             object;
 892         vm_object_offset_t      offset;
 893         vm_page_t               first_m;
 894         vm_object_t             next_object;
 895         vm_object_t             copy_object;
 896         boolean_t               look_for_page;
 897         boolean_t               force_fault_retry = FALSE;
 898         vm_prot_t               access_required = fault_type;
 899         vm_prot_t               wants_copy_flag;
 900         CLUSTER_STAT(int pages_at_higher_offsets;)
 901         CLUSTER_STAT(int pages_at_lower_offsets;)
 902         kern_return_t           wait_result;
 903         boolean_t               interruptible_state;
 904         boolean_t               data_already_requested = FALSE;
 905         vm_behavior_t           orig_behavior;
 906         vm_size_t               orig_cluster_size;
 907         vm_fault_return_t       error;
 908         int                     my_fault;
 909         uint32_t                try_failed_count;
 910         int                     interruptible; /* how may fault be interrupted? */
 911         int                     external_state = VM_EXTERNAL_STATE_UNKNOWN;
 912         memory_object_t         pager;
 913         vm_fault_return_t       retval;
 914         int                     grab_options;
 915
 916 /*
 917  * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
 918  * marked as paged out in the compressor pager or the pager doesn't exist.
 919  * Note also that if the pager for an internal object
 920  * has not been created, the pager is not invoked regardless of the value
 921  * of MUST_ASK_PAGER().
 922  *
 923  * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 924  * is marked as paged out in the compressor pager.
 925  * PAGED_OUT() is used to determine if a page has already been pushed
 926  * into a copy object in order to avoid a redundant page out operation.
 927  */
 928 #define MUST_ASK_PAGER(o, f, s)                                 \
 929         ((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
 930
 931 #define PAGED_OUT(o, f) \
 932         (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
 933
 934 /*
 935  *      Recovery actions
 936  */
 937 #define RELEASE_PAGE(m)                                 \
 938         MACRO_BEGIN                                     \
 939         PAGE_WAKEUP_DONE(m);                            \
 940         if ( !VM_PAGE_PAGEABLE(m)) {                    \
 941                 vm_page_lockspin_queues();              \
 942                 if ( !VM_PAGE_PAGEABLE(m)) {            \
 943                         if (VM_CONFIG_COMPRESSOR_IS_ACTIVE)     \
 944                                 vm_page_deactivate(m);          \
 945                         else                                    \
 946                                 vm_page_activate(m);            \
 947                 }                                               \
 948                 vm_page_unlock_queues();                        \
 949         }                                                       \
 950         MACRO_END
 951
 952 #if TRACEFAULTPAGE
 953         dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
 954 #endif
 955
 956         interruptible = fault_info->interruptible;
 957         interruptible_state = thread_interrupt_level(interruptible);
 958
 959         /*
 960          *      INVARIANTS (through entire routine):
 961          *
 962          *      1)      At all times, we must either have the object
 963          *              lock or a busy page in some object to prevent
 964          *              some other thread from trying to bring in
 965          *              the same page.
 966          *
 967          *              Note that we cannot hold any locks during the
 968          *              pager access or when waiting for memory, so
 969          *              we use a busy page then.
 970          *
 971          *      2)      To prevent another thread from racing us down the
 972          *              shadow chain and entering a new page in the top
 973          *              object before we do, we must keep a busy page in
 974          *              the top object while following the shadow chain.
 975          *
 976          *      3)      We must increment paging_in_progress on any object
 977          *              for which we have a busy page before dropping
 978          *              the object lock
 979          *
 980          *      4)      We leave busy pages on the pageout queues.
 981          *              If the pageout daemon comes across a busy page,
 982          *              it will remove the page from the pageout queues.
 983          */
 984
 985         object = first_object;
 986         offset = first_offset;
 987         first_m = VM_PAGE_NULL;
 988         access_required = fault_type;
 989
 990
 991         XPR(XPR_VM_FAULT,
 992                 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
 993                 object, offset, fault_type, *protection, 0);
 994
 995         /*
 996          * default type of fault
 997          */
 998         my_fault = DBG_CACHE_HIT_FAULT;
 999
1000         while (TRUE) {
1001 #if TRACEFAULTPAGE
1002                 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1003 #endif
1004
1005                 grab_options = 0;
1006 #if CONFIG_SECLUDED_MEMORY
1007                 if (object->can_grab_secluded) {
1008                         grab_options |= VM_PAGE_GRAB_SECLUDED;
1009                 }
1010 #endif /* CONFIG_SECLUDED_MEMORY */
1011
1012                 if (!object->alive) {
1013                         /*
1014                          * object is no longer valid
1015                          * clean up and return error
1016                          */
1017                         vm_fault_cleanup(object, first_m);
1018                         thread_interrupt_level(interruptible_state);
1019
1020                         return (VM_FAULT_MEMORY_ERROR);
1021                 }
1022
1023                 if (!object->pager_created && object->phys_contiguous) {
1024                         /*
1025                          * A physically-contiguous object without a pager:
1026                          * must be a "large page" object.  We do not deal
1027                          * with VM pages for this object.
1028                          */
1029                         caller_lookup = FALSE;
1030                         m = VM_PAGE_NULL;
1031                         goto phys_contig_object;
1032                 }
1033
1034                 if (object->blocked_access) {
1035                         /*
1036                          * Access to this VM object has been blocked.
1037                          * Replace our "paging_in_progress" reference with
1038                          * a "activity_in_progress" reference and wait for
1039                          * access to be unblocked.
1040                          */
1041                         caller_lookup = FALSE; /* no longer valid after sleep */
1042                         vm_object_activity_begin(object);
1043                         vm_object_paging_end(object);
1044                         while (object->blocked_access) {
1045                                 vm_object_sleep(object,
1046                                                 VM_OBJECT_EVENT_UNBLOCKED,
1047                                                 THREAD_UNINT);
1048                         }
1049                         vm_fault_page_blocked_access++;
1050                         vm_object_paging_begin(object);
1051                         vm_object_activity_end(object);
1052                 }
1053
1054                 /*
1055                  * See whether the page at 'offset' is resident
1056                  */
1057                 if (caller_lookup == TRUE) {
1058                         /*
1059                          * The caller has already looked up the page
1060                          * and gave us the result in "result_page".
1061                          * We can use this for the first lookup but
1062                          * it loses its validity as soon as we unlock
1063                          * the object.
1064                          */
1065                         m = *result_page;
1066                         caller_lookup = FALSE; /* no longer valid after that */
1067                 } else {
1068                         m = vm_page_lookup(object, offset);
1069                 }
1070 #if TRACEFAULTPAGE
1071                 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1072 #endif
1073                 if (m != VM_PAGE_NULL) {
1074
1075                         if (m->busy) {
1076                                 /*
1077                                  * The page is being brought in,
1078                                  * wait for it and then retry.
1079                                  */
1080 #if TRACEFAULTPAGE
1081                                 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1082 #endif
1083                                 wait_result = PAGE_SLEEP(object, m, interruptible);
1084
1085                                 XPR(XPR_VM_FAULT,
1086                                     "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
1087                                     object, offset,
1088                                     m, 0, 0);
1089                                 counter(c_vm_fault_page_block_busy_kernel++);
1090
1091                                 if (wait_result != THREAD_AWAKENED) {
1092                                         vm_fault_cleanup(object, first_m);
1093                                         thread_interrupt_level(interruptible_state);
1094
1095                                         if (wait_result == THREAD_RESTART)
1096                                                 return (VM_FAULT_RETRY);
1097                                         else
1098                                                 return (VM_FAULT_INTERRUPTED);
1099                                 }
1100                                 continue;
1101                         }
1102                         if (m->laundry) {
1103                                 m->free_when_done = FALSE;
1104
1105                                 if (!m->cleaning)
1106                                         vm_pageout_steal_laundry(m, FALSE);
1107                         }
1108                         if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
1109                                 /*
1110                                  * Guard page: off limits !
1111                                  */
1112                                 if (fault_type == VM_PROT_NONE) {
1113                                         /*
1114                                          * The fault is not requesting any
1115                                          * access to the guard page, so it must
1116                                          * be just to wire or unwire it.
1117                                          * Let's pretend it succeeded...
1118                                          */
1119                                         m->busy = TRUE;
1120                                         *result_page = m;
1121                                         assert(first_m == VM_PAGE_NULL);
1122                                         *top_page = first_m;
1123                                         if (type_of_fault)
1124                                                 *type_of_fault = DBG_GUARD_FAULT;
1125                                         thread_interrupt_level(interruptible_state);
1126                                         return VM_FAULT_SUCCESS;
1127                                 } else {
1128                                         /*
1129                                          * The fault requests access to the
1130                                          * guard page: let's deny that !
1131                                          */
1132                                         vm_fault_cleanup(object, first_m);
1133                                         thread_interrupt_level(interruptible_state);
1134                                         return VM_FAULT_MEMORY_ERROR;
1135                                 }
1136                         }
1137
1138                         if (m->error) {
1139                                 /*
1140                                  * The page is in error, give up now.
1141                                  */
1142 #if TRACEFAULTPAGE
1143                                 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);      /* (TEST/DEBUG) */
1144 #endif
1145                                 if (error_code)
1146                                         *error_code = KERN_MEMORY_ERROR;
1147                                 VM_PAGE_FREE(m);
1148
1149                                 vm_fault_cleanup(object, first_m);
1150                                 thread_interrupt_level(interruptible_state);
1151
1152                                 return (VM_FAULT_MEMORY_ERROR);
1153                         }
1154                         if (m->restart) {
1155                                 /*
1156                                  * The pager wants us to restart
1157                                  * at the top of the chain,
1158                                  * typically because it has moved the
1159                                  * page to another pager, then do so.
1160                                  */
1161 #if TRACEFAULTPAGE
1162                                 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1163 #endif
1164                                 VM_PAGE_FREE(m);
1165
1166                                 vm_fault_cleanup(object, first_m);
1167                                 thread_interrupt_level(interruptible_state);
1168
1169                                 return (VM_FAULT_RETRY);
1170                         }
1171                         if (m->absent) {
1172                                 /*
1173                                  * The page isn't busy, but is absent,
1174                                  * therefore it's deemed "unavailable".
1175                                  *
1176                                  * Remove the non-existent page (unless it's
1177                                  * in the top object) and move on down to the
1178                                  * next object (if there is one).
1179                                  */
1180 #if TRACEFAULTPAGE
1181                                 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);  /* (TEST/DEBUG) */
1182 #endif
1183                                 next_object = object->shadow;
1184
1185                                 if (next_object == VM_OBJECT_NULL) {
1186                                         /*
1187                                          * Absent page at bottom of shadow
1188                                          * chain; zero fill the page we left
1189                                          * busy in the first object, and free
1190                                          * the absent page.
1191                                          */
1192                                         assert(!must_be_resident);
1193
1194                                         /*
1195                                          * check for any conditions that prevent
1196                                          * us from creating a new zero-fill page
1197                                          * vm_fault_check will do all of the
1198                                          * fault cleanup in the case of an error condition
1199                                          * including resetting the thread_interrupt_level
1200                                          */
1201                                         error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1202
1203                                         if (error != VM_FAULT_SUCCESS)
1204                                                 return (error);
1205
1206                                         XPR(XPR_VM_FAULT,
1207                                             "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
1208                                                 object, offset,
1209                                                 m,
1210                                                 first_object, 0);
1211
1212                                         if (object != first_object) {
1213                                                 /*
1214                                                  * free the absent page we just found
1215                                                  */
1216                                                 VM_PAGE_FREE(m);
1217
1218                                                 /*
1219                                                  * drop reference and lock on current object
1220                                                  */
1221                                                 vm_object_paging_end(object);
1222                                                 vm_object_unlock(object);
1223
1224                                                 /*
1225                                                  * grab the original page we
1226                                                  * 'soldered' in place and
1227                                                  * retake lock on 'first_object'
1228                                                  */
1229                                                 m = first_m;
1230                                                 first_m = VM_PAGE_NULL;
1231
1232                                                 object = first_object;
1233                                                 offset = first_offset;
1234
1235                                                 vm_object_lock(object);
1236                                         } else {
1237                                                 /*
1238                                                  * we're going to use the absent page we just found
1239                                                  * so convert it to a 'busy' page
1240                                                  */
1241                                                 m->absent = FALSE;
1242                                                 m->busy = TRUE;
1243                                         }
1244                                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1245                                                 m->absent = TRUE;
1246                                         /*
1247                                          * zero-fill the page and put it on
1248                                          * the correct paging queue
1249                                          */
1250                                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1251
1252                                         break;
1253                                 } else {
1254                                         if (must_be_resident)
1255                                                 vm_object_paging_end(object);
1256                                         else if (object != first_object) {
1257                                                 vm_object_paging_end(object);
1258                                                 VM_PAGE_FREE(m);
1259                                         } else {
1260                                                 first_m = m;
1261                                                 m->absent = FALSE;
1262                                                 m->busy = TRUE;
1263
1264                                                 vm_page_lockspin_queues();
1265                                                 vm_page_queues_remove(m, FALSE);
1266                                                 vm_page_unlock_queues();
1267                                         }
1268                                         XPR(XPR_VM_FAULT,
1269                                             "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1270                                                 object, offset,
1271                                                 next_object,
1272                                                 offset+object->vo_shadow_offset,0);
1273
1274                                         offset += object->vo_shadow_offset;
1275                                         fault_info->lo_offset += object->vo_shadow_offset;
1276                                         fault_info->hi_offset += object->vo_shadow_offset;
1277                                         access_required = VM_PROT_READ;
1278
1279                                         vm_object_lock(next_object);
1280                                         vm_object_unlock(object);
1281                                         object = next_object;
1282                                         vm_object_paging_begin(object);
1283
1284                                         /*
1285                                          * reset to default type of fault
1286                                          */
1287                                         my_fault = DBG_CACHE_HIT_FAULT;
1288
1289                                         continue;
1290                                 }
1291                         }
1292                         if ((m->cleaning)
1293                             && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1294                             && (fault_type & VM_PROT_WRITE)) {
1295                                 /*
1296                                  * This is a copy-on-write fault that will
1297                                  * cause us to revoke access to this page, but
1298                                  * this page is in the process of being cleaned
1299                                  * in a clustered pageout. We must wait until
1300                                  * the cleaning operation completes before
1301                                  * revoking access to the original page,
1302                                  * otherwise we might attempt to remove a
1303                                  * wired mapping.
1304                                  */
1305 #if TRACEFAULTPAGE
1306                                 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
1307 #endif
1308                                 XPR(XPR_VM_FAULT,
1309                                     "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1310                                         object, offset,
1311                                         m, 0, 0);
1312                                 /*
1313                                  * take an extra ref so that object won't die
1314                                  */
1315                                 vm_object_reference_locked(object);
1316
1317                                 vm_fault_cleanup(object, first_m);
1318
1319                                 counter(c_vm_fault_page_block_backoff_kernel++);
1320                                 vm_object_lock(object);
1321                                 assert(object->ref_count > 0);
1322
1323                                 m = vm_page_lookup(object, offset);
1324
1325                                 if (m != VM_PAGE_NULL && m->cleaning) {
1326                                         PAGE_ASSERT_WAIT(m, interruptible);
1327
1328                                         vm_object_unlock(object);
1329                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1330                                         vm_object_deallocate(object);
1331
1332                                         goto backoff;
1333                                 } else {
1334                                         vm_object_unlock(object);
1335
1336                                         vm_object_deallocate(object);
1337                                         thread_interrupt_level(interruptible_state);
1338
1339                                         return (VM_FAULT_RETRY);
1340                                 }
1341                         }
1342                         if (type_of_fault == NULL && (m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
1343                             !(fault_info != NULL && fault_info->stealth)) {
1344                                 /*
1345                                  * If we were passed a non-NULL pointer for
1346                                  * "type_of_fault", then we came from
1347                                  * vm_fault... we'll let it deal with
1348                                  * this condition, since it
1349                                  * needs to see m->speculative to correctly
1350                                  * account the pageins, otherwise...
1351                                  * take it off the speculative queue, we'll
1352                                  * let the caller of vm_fault_page deal
1353                                  * with getting it onto the correct queue
1354                                  *
1355                                  * If the caller specified in fault_info that
1356                                  * it wants a "stealth" fault, we also leave
1357                                  * the page in the speculative queue.
1358                                  */
1359                                 vm_page_lockspin_queues();
1360                                 if (m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q)
1361                                         vm_page_queues_remove(m, FALSE);
1362                                 vm_page_unlock_queues();
1363                         }
1364                         assert(object == VM_PAGE_OBJECT(m));
1365
1366                         if (object->code_signed) {
1367                                 /*
1368                                  * CODE SIGNING:
1369                                  * We just paged in a page from a signed
1370                                  * memory object but we don't need to
1371                                  * validate it now.  We'll validate it
1372                                  * when it gets mapped into a user address
1373                                  * space for the first time or when the page
1374                                  * gets copied to another object as a result
1375                                  * of a copy-on-write.
1376                                  */
1377                         }
1378
1379                         /*
1380                          * We mark the page busy and leave it on
1381                          * the pageout queues.  If the pageout
1382                          * daemon comes across it, then it will
1383                          * remove the page from the queue, but not the object
1384                          */
1385 #if TRACEFAULTPAGE
1386                         dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1387 #endif
1388                         XPR(XPR_VM_FAULT,
1389                             "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1390                                 object, offset, m, 0, 0);
1391                         assert(!m->busy);
1392                         assert(!m->absent);
1393
1394                         m->busy = TRUE;
1395                         break;
1396                 }
1397
1398
1399                 /*
1400                  * we get here when there is no page present in the object at
1401                  * the offset we're interested in... we'll allocate a page
1402                  * at this point if the pager associated with
1403                  * this object can provide the data or we're the top object...
1404                  * object is locked;  m == NULL
1405                  */
1406
1407                 if (must_be_resident) {
1408                         if (fault_type == VM_PROT_NONE &&
1409                             object == kernel_object) {
1410                                 /*
1411                                  * We've been called from vm_fault_unwire()
1412                                  * while removing a map entry that was allocated
1413                                  * with KMA_KOBJECT and KMA_VAONLY.  This page
1414                                  * is not present and there's nothing more to
1415                                  * do here (nothing to unwire).
1416                                  */
1417                                 vm_fault_cleanup(object, first_m);
1418                                 thread_interrupt_level(interruptible_state);
1419
1420                                 return VM_FAULT_MEMORY_ERROR;
1421                         }
1422
1423                         goto dont_look_for_page;
1424                 }
1425
1426                 /* Don't expect to fault pages into the kernel object. */
1427                 assert(object != kernel_object);
1428
1429                 data_supply = FALSE;
1430
1431                 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply);
1432
1433 #if TRACEFAULTPAGE
1434                 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);      /* (TEST/DEBUG) */
1435 #endif
1436                 if (!look_for_page && object == first_object && !object->phys_contiguous) {
1437                         /*
1438                          * Allocate a new page for this object/offset pair as a placeholder
1439                          */
1440                         m = vm_page_grab_options(grab_options);
1441 #if TRACEFAULTPAGE
1442                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1443 #endif
1444                         if (m == VM_PAGE_NULL) {
1445
1446                                 vm_fault_cleanup(object, first_m);
1447                                 thread_interrupt_level(interruptible_state);
1448
1449                                 return (VM_FAULT_MEMORY_SHORTAGE);
1450                         }
1451
1452                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1453                                 vm_page_insert_internal(m, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1454                         } else {
1455                                 vm_page_insert(m, object, offset);
1456                         }
1457                 }
1458                 if (look_for_page) {
1459                         kern_return_t   rc;
1460                         int             my_fault_type;
1461
1462                         /*
1463                          *      If the memory manager is not ready, we
1464                          *      cannot make requests.
1465                          */
1466                         if (!object->pager_ready) {
1467 #if TRACEFAULTPAGE
1468                                 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1469 #endif
1470                                 if (m != VM_PAGE_NULL)
1471                                         VM_PAGE_FREE(m);
1472
1473                                 XPR(XPR_VM_FAULT,
1474                                 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1475                                         object, offset, 0, 0, 0);
1476
1477                                 /*
1478                                  * take an extra ref so object won't die
1479                                  */
1480                                 vm_object_reference_locked(object);
1481                                 vm_fault_cleanup(object, first_m);
1482                                 counter(c_vm_fault_page_block_backoff_kernel++);
1483
1484                                 vm_object_lock(object);
1485                                 assert(object->ref_count > 0);
1486
1487                                 if (!object->pager_ready) {
1488                                         wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1489
1490                                         vm_object_unlock(object);
1491                                         if (wait_result == THREAD_WAITING)
1492                                                 wait_result = thread_block(THREAD_CONTINUE_NULL);
1493                                         vm_object_deallocate(object);
1494
1495                                         goto backoff;
1496                                 } else {
1497                                         vm_object_unlock(object);
1498                                         vm_object_deallocate(object);
1499                                         thread_interrupt_level(interruptible_state);
1500
1501                                         return (VM_FAULT_RETRY);
1502                                 }
1503                         }
1504                         if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1505                                 /*
1506                                  * If there are too many outstanding page
1507                                  * requests pending on this external object, we
1508                                  * wait for them to be resolved now.
1509                                  */
1510 #if TRACEFAULTPAGE
1511                                 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1512 #endif
1513                                 if (m != VM_PAGE_NULL)
1514                                         VM_PAGE_FREE(m);
1515                                 /*
1516                                  * take an extra ref so object won't die
1517                                  */
1518                                 vm_object_reference_locked(object);
1519
1520                                 vm_fault_cleanup(object, first_m);
1521
1522                                 counter(c_vm_fault_page_block_backoff_kernel++);
1523
1524                                 vm_object_lock(object);
1525                                 assert(object->ref_count > 0);
1526
1527                                 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1528                                         vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1529
1530                                         vm_object_unlock(object);
1531                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1532                                         vm_object_deallocate(object);
1533
1534                                         goto backoff;
1535                                 } else {
1536                                         vm_object_unlock(object);
1537                                         vm_object_deallocate(object);
1538                                         thread_interrupt_level(interruptible_state);
1539
1540                                         return (VM_FAULT_RETRY);
1541                                 }
1542                         }
1543                         if (object->internal) {
1544                                 int compressed_count_delta;
1545
1546                                 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
1547
1548                                 if (m == VM_PAGE_NULL) {
1549                                         /*
1550                                          * Allocate a new page for this object/offset pair as a placeholder
1551                                          */
1552                                         m = vm_page_grab_options(grab_options);
1553 #if TRACEFAULTPAGE
1554                                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1555 #endif
1556                                         if (m == VM_PAGE_NULL) {
1557
1558                                                 vm_fault_cleanup(object, first_m);
1559                                                 thread_interrupt_level(interruptible_state);
1560
1561                                                 return (VM_FAULT_MEMORY_SHORTAGE);
1562                                         }
1563
1564                                         m->absent = TRUE;
1565                                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1566                                                 vm_page_insert_internal(m, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1567                                         } else {
1568                                                 vm_page_insert(m, object, offset);
1569                                         }
1570                                 }
1571                                 assert(m->busy);
1572
1573                                 m->absent = TRUE;
1574                                 pager = object->pager;
1575
1576                                 assert(object->paging_in_progress > 0);
1577                                 vm_object_unlock(object);
1578
1579                                 rc = vm_compressor_pager_get(
1580                                         pager,
1581                                         offset + object->paging_offset,
1582                                         VM_PAGE_GET_PHYS_PAGE(m),
1583                                         &my_fault_type,
1584                                         0,
1585                                         &compressed_count_delta);
1586
1587                                 if (type_of_fault == NULL) {
1588                                         int     throttle_delay;
1589
1590                                         /*
1591                                          * we weren't called from vm_fault, so we
1592                                          * need to apply page creation throttling
1593                                          * do it before we re-acquire any locks
1594                                          */
1595                                         if (my_fault_type == DBG_COMPRESSOR_FAULT) {
1596                                                 if ((throttle_delay = vm_page_throttled(TRUE))) {
1597                                                         VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
1598                                                         delay(throttle_delay);
1599                                                 }
1600                                         }
1601                                 }
1602                                 vm_object_lock(object);
1603                                 assert(object->paging_in_progress > 0);
1604
1605                                 vm_compressor_pager_count(
1606                                         pager,
1607                                         compressed_count_delta,
1608                                         FALSE, /* shared_lock */
1609                                         object);
1610
1611                                 switch (rc) {
1612                                 case KERN_SUCCESS:
1613                                         m->absent = FALSE;
1614                                         m->dirty = TRUE;
1615                                         if ((object->wimg_bits &
1616                                              VM_WIMG_MASK) !=
1617                                             VM_WIMG_USE_DEFAULT) {
1618                                                 /*
1619                                                  * If the page is not cacheable,
1620                                                  * we can't let its contents
1621                                                  * linger in the data cache
1622                                                  * after the decompression.
1623                                                  */
1624                                                 pmap_sync_page_attributes_phys(
1625                                                         VM_PAGE_GET_PHYS_PAGE(m));
1626                                         } else {
1627                                                 m->written_by_kernel = TRUE;
1628                                         }
1629
1630                                         /*
1631                                          * If the object is purgeable, its
1632                                          * owner's purgeable ledgers have been
1633                                          * updated in vm_page_insert() but the
1634                                          * page was also accounted for in a
1635                                          * "compressed purgeable" ledger, so
1636                                          * update that now.
1637                                          */
1638                                         if ((object->purgable !=
1639                                              VM_PURGABLE_DENY) &&
1640                                             (object->vo_purgeable_owner !=
1641                                              NULL)) {
1642                                                 /*
1643                                                  * One less compressed
1644                                                  * purgeable page.
1645                                                  */
1646                                                 vm_purgeable_compressed_update(
1647                                                         object,
1648                                                         -1);
1649                                         }
1650
1651                                         break;
1652                                 case KERN_MEMORY_FAILURE:
1653                                         m->unusual = TRUE;
1654                                         m->error = TRUE;
1655                                         m->absent = FALSE;
1656                                         break;
1657                                 case KERN_MEMORY_ERROR:
1658                                         assert(m->absent);
1659                                         break;
1660                                 default:
1661                                         panic("vm_fault_page(): unexpected "
1662                                               "error %d from "
1663                                               "vm_compressor_pager_get()\n",
1664                                               rc);
1665                                 }
1666                                 PAGE_WAKEUP_DONE(m);
1667
1668                                 rc = KERN_SUCCESS;
1669                                 goto data_requested;
1670                         }
1671                         my_fault_type = DBG_PAGEIN_FAULT;
1672
1673                         if (m != VM_PAGE_NULL) {
1674                                 VM_PAGE_FREE(m);
1675                                 m = VM_PAGE_NULL;
1676                         }
1677
1678 #if TRACEFAULTPAGE
1679                         dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);  /* (TEST/DEBUG) */
1680 #endif
1681
1682                         /*
1683                          * It's possible someone called vm_object_destroy while we weren't
1684                          * holding the object lock.  If that has happened, then bail out
1685                          * here.
1686                          */
1687
1688                         pager = object->pager;
1689
1690                         if (pager == MEMORY_OBJECT_NULL) {
1691                                 vm_fault_cleanup(object, first_m);
1692                                 thread_interrupt_level(interruptible_state);
1693                                 return VM_FAULT_MEMORY_ERROR;
1694                         }
1695
1696                         /*
1697                          * We have an absent page in place for the faulting offset,
1698                          * so we can release the object lock.
1699                          */
1700
1701                         if (object->object_slid == TRUE) {
1702                                 set_thread_rwlock_boost();
1703                         }
1704
1705                         vm_object_unlock(object);
1706
1707                         /*
1708                          * If this object uses a copy_call strategy,
1709                          * and we are interested in a copy of this object
1710                          * (having gotten here only by following a
1711                          * shadow chain), then tell the memory manager
1712                          * via a flag added to the desired_access
1713                          * parameter, so that it can detect a race
1714                          * between our walking down the shadow chain
1715                          * and its pushing pages up into a copy of
1716                          * the object that it manages.
1717                          */
1718                         if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1719                                 wants_copy_flag = VM_PROT_WANTS_COPY;
1720                         else
1721                                 wants_copy_flag = VM_PROT_NONE;
1722
1723                         XPR(XPR_VM_FAULT,
1724                             "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1725                                 object, offset, m,
1726                                 access_required | wants_copy_flag, 0);
1727
1728                         if (object->copy == first_object) {
1729                                 /*
1730                                  * if we issue the memory_object_data_request in
1731                                  * this state, we are subject to a deadlock with
1732                                  * the underlying filesystem if it is trying to
1733                                  * shrink the file resulting in a push of pages
1734                                  * into the copy object...  that push will stall
1735                                  * on the placeholder page, and if the pushing thread
1736                                  * is holding a lock that is required on the pagein
1737                                  * path (such as a truncate lock), we'll deadlock...
1738                                  * to avoid this potential deadlock, we throw away
1739                                  * our placeholder page before calling memory_object_data_request
1740                                  * and force this thread to retry the vm_fault_page after
1741                                  * we have issued the I/O.  the second time through this path
1742                                  * we will find the page already in the cache (presumably still
1743                                  * busy waiting for the I/O to complete) and then complete
1744                                  * the fault w/o having to go through memory_object_data_request again
1745                                  */
1746                                 assert(first_m != VM_PAGE_NULL);
1747                                 assert(VM_PAGE_OBJECT(first_m) == first_object);
1748
1749                                 vm_object_lock(first_object);
1750                                 VM_PAGE_FREE(first_m);
1751                                 vm_object_paging_end(first_object);
1752                                 vm_object_unlock(first_object);
1753
1754                                 first_m = VM_PAGE_NULL;
1755                                 force_fault_retry = TRUE;
1756
1757                                 vm_fault_page_forced_retry++;
1758                         }
1759
1760                         if (data_already_requested == TRUE) {
1761                                 orig_behavior = fault_info->behavior;
1762                                 orig_cluster_size = fault_info->cluster_size;
1763
1764                                 fault_info->behavior = VM_BEHAVIOR_RANDOM;
1765                                 fault_info->cluster_size = PAGE_SIZE;
1766                         }
1767                         /*
1768                          * Call the memory manager to retrieve the data.
1769                          */
1770                         rc = memory_object_data_request(
1771                                 pager,
1772                                 offset + object->paging_offset,
1773                                 PAGE_SIZE,
1774                                 access_required | wants_copy_flag,
1775                                 (memory_object_fault_info_t)fault_info);
1776
1777                         if (data_already_requested == TRUE) {
1778                                 fault_info->behavior = orig_behavior;
1779                                 fault_info->cluster_size = orig_cluster_size;
1780                         } else
1781                                 data_already_requested = TRUE;
1782
1783                         DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1784 #if TRACEFAULTPAGE
1785                         dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1786 #endif
1787                         vm_object_lock(object);
1788
1789                         if (object->object_slid == TRUE) {
1790                                 clear_thread_rwlock_boost();
1791                         }
1792
1793                 data_requested:
1794                         if (rc != KERN_SUCCESS) {
1795
1796                                 vm_fault_cleanup(object, first_m);
1797                                 thread_interrupt_level(interruptible_state);
1798
1799                                 return ((rc == MACH_SEND_INTERRUPTED) ?
1800                                         VM_FAULT_INTERRUPTED :
1801                                         VM_FAULT_MEMORY_ERROR);
1802                         } else {
1803                                 clock_sec_t     tv_sec;
1804                                 clock_usec_t    tv_usec;
1805
1806                                 if (my_fault_type == DBG_PAGEIN_FAULT) {
1807                                         clock_get_system_microtime(&tv_sec, &tv_usec);
1808                                         current_thread()->t_page_creation_time = tv_sec;
1809                                         current_thread()->t_page_creation_count = 0;
1810                                 }
1811                         }
1812                         if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1813
1814                                 vm_fault_cleanup(object, first_m);
1815                                 thread_interrupt_level(interruptible_state);
1816
1817                                 return (VM_FAULT_INTERRUPTED);
1818                         }
1819                         if (force_fault_retry == TRUE) {
1820
1821                                 vm_fault_cleanup(object, first_m);
1822                                 thread_interrupt_level(interruptible_state);
1823
1824                                 return (VM_FAULT_RETRY);
1825                         }
1826                         if (m == VM_PAGE_NULL && object->phys_contiguous) {
1827                                 /*
1828                                  * No page here means that the object we
1829                                  * initially looked up was "physically
1830                                  * contiguous" (i.e. device memory).  However,
1831                                  * with Virtual VRAM, the object might not
1832                                  * be backed by that device memory anymore,
1833                                  * so we're done here only if the object is
1834                                  * still "phys_contiguous".
1835                                  * Otherwise, if the object is no longer
1836                                  * "phys_contiguous", we need to retry the
1837                                  * page fault against the object's new backing
1838                                  * store (different memory object).
1839                                  */
1840                         phys_contig_object:
1841                                 goto done;
1842                         }
1843                         /*
1844                          * potentially a pagein fault
1845                          * if we make it through the state checks
1846                          * above, than we'll count it as such
1847                          * above, then we'll count it as such
1848                         my_fault = my_fault_type;
1849
1850                         /*
1851                          * Retry with same object/offset, since new data may
1852                          * be in a different page (i.e., m is meaningless at
1853                          * this point).
1854                          */
1855                         continue;
1856                 }
1857 dont_look_for_page:
1858                 /*
1859                  * We get here if the object has no pager, or an existence map
1860                  * exists and indicates the page isn't present on the pager
1861                  * or we're unwiring a page.  If a pager exists, but there
1862                  * is no existence map, then the m->absent case above handles
1863                  * the ZF case when the pager can't provide the page
1864                  */
1865 #if TRACEFAULTPAGE
1866                 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1867 #endif
1868                 if (object == first_object)
1869                         first_m = m;
1870                 else
1871                         assert(m == VM_PAGE_NULL);
1872
1873                 XPR(XPR_VM_FAULT,
1874                     "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1875                         object, offset, m,
1876                         object->shadow, 0);
1877
1878                 next_object = object->shadow;
1879
1880                 if (next_object == VM_OBJECT_NULL) {
1881                         /*
1882                          * we've hit the bottom of the shadow chain,
1883                          * fill the page in the top object with zeros.
1884                          */
1885                         assert(!must_be_resident);
1886
1887                         if (object != first_object) {
1888                                 vm_object_paging_end(object);
1889                                 vm_object_unlock(object);
1890
1891                                 object = first_object;
1892                                 offset = first_offset;
1893                                 vm_object_lock(object);
1894                         }
1895                         m = first_m;
1896                         assert(VM_PAGE_OBJECT(m) == object);
1897                         first_m = VM_PAGE_NULL;
1898
1899                         /*
1900                          * check for any conditions that prevent
1901                          * us from creating a new zero-fill page
1902                          * vm_fault_check will do all of the
1903                          * fault cleanup in the case of an error condition
1904                          * including resetting the thread_interrupt_level
1905                          */
1906                         error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1907
1908                         if (error != VM_FAULT_SUCCESS)
1909                                 return (error);
1910
1911                         if (m == VM_PAGE_NULL) {
1912                                 m = vm_page_grab_options(grab_options);
1913
1914                                 if (m == VM_PAGE_NULL) {
1915                                         vm_fault_cleanup(object, VM_PAGE_NULL);
1916                                         thread_interrupt_level(interruptible_state);
1917
1918                                         return (VM_FAULT_MEMORY_SHORTAGE);
1919                                 }
1920                                 vm_page_insert(m, object, offset);
1921                         }
1922                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1923                                 m->absent = TRUE;
1924
1925                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1926
1927                         break;
1928
1929                 } else {
1930                         /*
1931                          * Move on to the next object.  Lock the next
1932                          * object before unlocking the current one.
1933                          */
1934                         if ((object != first_object) || must_be_resident)
1935                                 vm_object_paging_end(object);
1936
1937                         offset += object->vo_shadow_offset;
1938                         fault_info->lo_offset += object->vo_shadow_offset;
1939                         fault_info->hi_offset += object->vo_shadow_offset;
1940                         access_required = VM_PROT_READ;
1941
1942                         vm_object_lock(next_object);
1943                         vm_object_unlock(object);
1944
1945                         object = next_object;
1946                         vm_object_paging_begin(object);
1947                 }
1948         }
1949
1950         /*
1951          *      PAGE HAS BEEN FOUND.
1952          *
1953          *      This page (m) is:
1954          *              busy, so that we can play with it;
1955          *              not absent, so that nobody else will fill it;
1956          *              possibly eligible for pageout;
1957          *
1958          *      The top-level page (first_m) is:
1959          *              VM_PAGE_NULL if the page was found in the
1960          *               top-level object;
1961          *              busy, not absent, and ineligible for pageout.
1962          *
1963          *      The current object (object) is locked.  A paging
1964          *      reference is held for the current and top-level
1965          *      objects.
1966          */
1967
1968 #if TRACEFAULTPAGE
1969         dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1970 #endif
1971 #if     EXTRA_ASSERTIONS
1972         assert(m->busy && !m->absent);
1973         assert((first_m == VM_PAGE_NULL) ||
1974                (first_m->busy && !first_m->absent &&
1975                 !first_m->active && !first_m->inactive && !first_m->secluded));
1976 #endif  /* EXTRA_ASSERTIONS */
1977
1978         XPR(XPR_VM_FAULT,
1979             "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1980                 object, offset, m,
1981                 first_object, first_m);
1982
1983         /*
1984          * If the page is being written, but isn't
1985          * already owned by the top-level object,
1986          * we have to copy it into a new page owned
1987          * by the top-level object.
1988          */
1989         if (object != first_object) {
1990
1991 #if TRACEFAULTPAGE
1992                 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1993 #endif
1994                 if (fault_type & VM_PROT_WRITE) {
1995                         vm_page_t copy_m;
1996
1997                         /*
1998                          * We only really need to copy if we
1999                          * want to write it.
2000                          */
2001                         assert(!must_be_resident);
2002
2003                         /*
2004                          * are we protecting the system from
2005                          * backing store exhaustion?  If so,
2006                          * sleep unless we are privileged.
2007                          */
2008                         if (vm_backing_store_low) {
2009                                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
2010
2011                                         RELEASE_PAGE(m);
2012                                         vm_fault_cleanup(object, first_m);
2013
2014                                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
2015
2016                                         thread_block(THREAD_CONTINUE_NULL);
2017                                         thread_interrupt_level(interruptible_state);
2018
2019                                         return (VM_FAULT_RETRY);
2020                                 }
2021                         }
2022                         /*
2023                          * If we try to collapse first_object at this
2024                          * point, we may deadlock when we try to get
2025                          * the lock on an intermediate object (since we
2026                          * have the bottom object locked).  We can't
2027                          * unlock the bottom object, because the page
2028                          * we found may move (by collapse) if we do.
2029                          *
2030                          * Instead, we first copy the page.  Then, when
2031                          * we have no more use for the bottom object,
2032                          * we unlock it and try to collapse.
2033                          *
2034                          * Note that we copy the page even if we didn't
2035                          * need to... that's the breaks.
2036                          */
2037
2038                         /*
2039                          * Allocate a page for the copy
2040                          */
2041                         copy_m = vm_page_grab_options(grab_options);
2042
2043                         if (copy_m == VM_PAGE_NULL) {
2044                                 RELEASE_PAGE(m);
2045
2046                                 vm_fault_cleanup(object, first_m);
2047                                 thread_interrupt_level(interruptible_state);
2048
2049                                 return (VM_FAULT_MEMORY_SHORTAGE);
2050                         }
2051                         XPR(XPR_VM_FAULT,
2052                             "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
2053                                 object, offset,
2054                                 m, copy_m, 0);
2055
2056                         vm_page_copy(m, copy_m);
2057
2058                         /*
2059                          * If another map is truly sharing this
2060                          * page with us, we have to flush all
2061                          * uses of the original page, since we
2062                          * can't distinguish those which want the
2063                          * original from those which need the
2064                          * new copy.
2065                          *
2066                          * XXXO If we know that only one map has
2067                          * access to this page, then we could
2068                          * avoid the pmap_disconnect() call.
2069                          */
2070                         if (m->pmapped)
2071                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2072
2073                         if (m->clustered) {
2074                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2075                                 VM_PAGE_CONSUME_CLUSTERED(m);
2076                         }
2077                         assert(!m->cleaning);
2078
2079                         /*
2080                          * We no longer need the old page or object.
2081                          */
2082                         RELEASE_PAGE(m);
2083
2084                         /*
2085                          * This check helps with marking the object as having a sequential pattern.
2086                          * Normally we'll miss doing this below because this fault is about COW to
2087                          * the first_object, i.e. bring page in from disk, push to object above, but
2088                          * don't update the file object's sequential pattern.
2089                          */
2090                         if (object->internal == FALSE) {
2091                                 vm_fault_is_sequential(object, offset, fault_info->behavior);
2092                         }
2093
2094                         vm_object_paging_end(object);
2095                         vm_object_unlock(object);
2096
2097                         my_fault = DBG_COW_FAULT;
2098                         VM_STAT_INCR(cow_faults);
2099                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2100                         current_task()->cow_faults++;
2101
2102                         object = first_object;
2103                         offset = first_offset;
2104
2105                         vm_object_lock(object);
2106                         /*
2107                          * get rid of the place holder
2108                          * page that we soldered in earlier
2109                          */
2110                         VM_PAGE_FREE(first_m);
2111                         first_m = VM_PAGE_NULL;
2112
2113                         /*
2114                          * and replace it with the
2115                          * page we just copied into
2116                          */
2117                         assert(copy_m->busy);
2118                         vm_page_insert(copy_m, object, offset);
2119                         SET_PAGE_DIRTY(copy_m, TRUE);
2120
2121                         m = copy_m;
2122                         /*
2123                          * Now that we've gotten the copy out of the
2124                          * way, let's try to collapse the top object.
2125                          * But we have to play ugly games with
2126                          * paging_in_progress to do that...
2127                          */
2128                         vm_object_paging_end(object);
2129                         vm_object_collapse(object, offset, TRUE);
2130                         vm_object_paging_begin(object);
2131
2132                 } else
2133                         *protection &= (~VM_PROT_WRITE);
2134         }
2135         /*
2136          * Now check whether the page needs to be pushed into the
2137          * copy object.  The use of asymmetric copy on write for
2138          * shared temporary objects means that we may do two copies to
2139          * satisfy the fault; one above to get the page from a
2140          * shadowed object, and one here to push it into the copy.
2141          */
2142         try_failed_count = 0;
2143
2144         while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
2145                 vm_object_offset_t      copy_offset;
2146                 vm_page_t               copy_m;
2147
2148 #if TRACEFAULTPAGE
2149                 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);    /* (TEST/DEBUG) */
2150 #endif
2151                 /*
2152                  * If the page is being written, but hasn't been
2153                  * copied to the copy-object, we have to copy it there.
2154                  */
2155                 if ((fault_type & VM_PROT_WRITE) == 0) {
2156                         *protection &= ~VM_PROT_WRITE;
2157                         break;
2158                 }
2159
2160                 /*
2161                  * If the page was guaranteed to be resident,
2162                  * we must have already performed the copy.
2163                  */
2164                 if (must_be_resident)
2165                         break;
2166
2167                 /*
2168                  * Try to get the lock on the copy_object.
2169                  */
2170                 if (!vm_object_lock_try(copy_object)) {
2171
2172                         vm_object_unlock(object);
2173                         try_failed_count++;
2174
2175                         mutex_pause(try_failed_count);  /* wait a bit */
2176                         vm_object_lock(object);
2177
2178                         continue;
2179                 }
2180                 try_failed_count = 0;
2181
2182                 /*
2183                  * Make another reference to the copy-object,
2184                  * to keep it from disappearing during the
2185                  * copy.
2186                  */
2187                 vm_object_reference_locked(copy_object);
2188
2189                 /*
2190                  * Does the page exist in the copy?
2191                  */
2192                 copy_offset = first_offset - copy_object->vo_shadow_offset;
2193
2194                 if (copy_object->vo_size <= copy_offset)
2195                         /*
2196                          * Copy object doesn't cover this page -- do nothing.
2197                          */
2198                         ;
2199                 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2200                         /*
2201                          * Page currently exists in the copy object
2202                          */
2203                         if (copy_m->busy) {
2204                                 /*
2205                                  * If the page is being brought
2206                                  * in, wait for it and then retry.
2207                                  */
2208                                 RELEASE_PAGE(m);
2209
2210                                 /*
2211                                  * take an extra ref so object won't die
2212                                  */
2213                                 vm_object_reference_locked(copy_object);
2214                                 vm_object_unlock(copy_object);
2215                                 vm_fault_cleanup(object, first_m);
2216                                 counter(c_vm_fault_page_block_backoff_kernel++);
2217
2218                                 vm_object_lock(copy_object);
2219                                 assert(copy_object->ref_count > 0);
2220                                 VM_OBJ_RES_DECR(copy_object);
2221                                 vm_object_lock_assert_exclusive(copy_object);
2222                                 copy_object->ref_count--;
2223                                 assert(copy_object->ref_count > 0);
2224                                 copy_m = vm_page_lookup(copy_object, copy_offset);
2225
2226                                 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
2227                                         PAGE_ASSERT_WAIT(copy_m, interruptible);
2228
2229                                         vm_object_unlock(copy_object);
2230                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
2231                                         vm_object_deallocate(copy_object);
2232
2233                                         goto backoff;
2234                                 } else {
2235                                         vm_object_unlock(copy_object);
2236                                         vm_object_deallocate(copy_object);
2237                                         thread_interrupt_level(interruptible_state);
2238
2239                                         return (VM_FAULT_RETRY);
2240                                 }
2241                         }
2242                 }
2243                 else if (!PAGED_OUT(copy_object, copy_offset)) {
2244                         /*
2245                          * If PAGED_OUT is TRUE, then the page used to exist
2246                          * in the copy-object, and has already been paged out.
2247                          * We don't need to repeat this. If PAGED_OUT is
2248                          * FALSE, then either we don't know (!pager_created,
2249                          * for example) or it hasn't been paged out.
2250                          * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2251                          * We must copy the page to the copy object.
2252                          */
2253
2254                         if (vm_backing_store_low) {
2255                                 /*
2256                                  * we are protecting the system from
2257                                  * backing store exhaustion, so
2258                                  * sleep unless we are privileged.
2259                                  */
2260                                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
2261                                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
2262
2263                                         RELEASE_PAGE(m);
2264                                         VM_OBJ_RES_DECR(copy_object);
2265                                         vm_object_lock_assert_exclusive(copy_object);
2266                                         copy_object->ref_count--;
2267                                         assert(copy_object->ref_count > 0);
2268
2269                                         vm_object_unlock(copy_object);
2270                                         vm_fault_cleanup(object, first_m);
2271                                         thread_block(THREAD_CONTINUE_NULL);
2272                                         thread_interrupt_level(interruptible_state);
2273
2274                                         return (VM_FAULT_RETRY);
2275                                 }
2276                         }
2277                         /*
2278                          * Allocate a page for the copy
2279                          */
2280                         copy_m = vm_page_alloc(copy_object, copy_offset);
2281
2282                         if (copy_m == VM_PAGE_NULL) {
2283                                 RELEASE_PAGE(m);
2284
2285                                 VM_OBJ_RES_DECR(copy_object);
2286                                 vm_object_lock_assert_exclusive(copy_object);
2287                                 copy_object->ref_count--;
2288                                 assert(copy_object->ref_count > 0);
2289
2290                                 vm_object_unlock(copy_object);
2291                                 vm_fault_cleanup(object, first_m);
2292                                 thread_interrupt_level(interruptible_state);
2293
2294                                 return (VM_FAULT_MEMORY_SHORTAGE);
2295                         }
2296                         /*
2297                          * Must copy page into copy-object.
2298                          */
2299                         vm_page_copy(m, copy_m);
2300
2301                         /*
2302                          * If the old page was in use by any users
2303                          * of the copy-object, it must be removed
2304                          * from all pmaps.  (We can't know which
2305                          * pmaps use it.)
2306                          */
2307                         if (m->pmapped)
2308                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2309
2310                         if (m->clustered) {
2311                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2312                                 VM_PAGE_CONSUME_CLUSTERED(m);
2313                         }
2314                         /*
2315                          * If there's a pager, then immediately
2316                          * page out this page, using the "initialize"
2317                          * option.  Else, we use the copy.
2318                          */
2319                         if ((!copy_object->pager_ready)
2320                             || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2321                            ) {
2322
2323                                 vm_page_lockspin_queues();
2324                                 assert(!m->cleaning);
2325                                 vm_page_activate(copy_m);
2326                                 vm_page_unlock_queues();
2327
2328                                 SET_PAGE_DIRTY(copy_m, TRUE);
2329                                 PAGE_WAKEUP_DONE(copy_m);
2330
2331                         } else {
2332
2333                                 assert(copy_m->busy == TRUE);
2334                                 assert(!m->cleaning);
2335
2336                                 /*
2337                                  * dirty is protected by the object lock
2338                                  */
2339                                 SET_PAGE_DIRTY(copy_m, TRUE);
2340
2341                                 /*
2342                                  * The page is already ready for pageout:
2343                                  * not on pageout queues and busy.
2344                                  * Unlock everything except the
2345                                  * copy_object itself.
2346                                  */
2347                                 vm_object_unlock(object);
2348
2349                                 /*
2350                                  * Write the page to the copy-object,
2351                                  * flushing it from the kernel.
2352                                  */
2353                                 vm_pageout_initialize_page(copy_m);
2354
2355                                 /*
2356                                  * Since the pageout may have
2357                                  * temporarily dropped the
2358                                  * copy_object's lock, we
2359                                  * check whether we'll have
2360                                  * to deallocate the hard way.
2361                                  */
2362                                 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2363                                         vm_object_unlock(copy_object);
2364                                         vm_object_deallocate(copy_object);
2365                                         vm_object_lock(object);
2366
2367                                         continue;
2368                                 }
2369                                 /*
2370                                  * Pick back up the old object's
2371                                  * lock.  [It is safe to do so,
2372                                  * since it must be deeper in the
2373                                  * object tree.]
2374                                  */
2375                                 vm_object_lock(object);
2376                         }
2377
2378                         /*
2379                          * Because we're pushing a page upward
2380                          * in the object tree, we must restart
2381                          * any faults that are waiting here.
2382                          * [Note that this is an expansion of
2383                          * PAGE_WAKEUP that uses the THREAD_RESTART
2384                          * wait result].  Can't turn off the page's
2385                          * busy bit because we're not done with it.
2386                          */
2387                         if (m->wanted) {
2388                                 m->wanted = FALSE;
2389                                 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2390                         }
2391                 }
2392                 /*
2393                  * The reference count on copy_object must be
2394                  * at least 2: one for our extra reference,
2395                  * and at least one from the outside world
2396                  * (we checked that when we last locked
2397                  * copy_object).
2398                  */
2399                 vm_object_lock_assert_exclusive(copy_object);
2400                 copy_object->ref_count--;
2401                 assert(copy_object->ref_count > 0);
2402
2403                 VM_OBJ_RES_DECR(copy_object);
2404                 vm_object_unlock(copy_object);
2405
2406                 break;
2407         }
2408
2409 done:
2410         *result_page = m;
2411         *top_page = first_m;
2412
2413         XPR(XPR_VM_FAULT,
2414                 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
2415                 object, offset, m, first_m, 0);
2416
2417         if (m != VM_PAGE_NULL) {
2418                 assert(VM_PAGE_OBJECT(m) == object);
2419
2420                 retval = VM_FAULT_SUCCESS;
2421
2422                 if (my_fault == DBG_PAGEIN_FAULT) {
2423
2424                         VM_PAGE_COUNT_AS_PAGEIN(m);
2425
2426                         if (object->internal)
2427                                 my_fault = DBG_PAGEIND_FAULT;
2428                         else
2429                                 my_fault = DBG_PAGEINV_FAULT;
2430
2431                         /*
2432                          * evaluate access pattern and update state
2433                          * vm_fault_deactivate_behind depends on the
2434                          * state being up to date
2435                          */
2436                         vm_fault_is_sequential(object, offset, fault_info->behavior);
2437
2438                         vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2439                 } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2440
2441                         VM_STAT_INCR(decompressions);
2442                 }
2443                 if (type_of_fault)
2444                         *type_of_fault = my_fault;
2445         } else {
2446                 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2447                 assert(first_m == VM_PAGE_NULL);
2448                 assert(object == first_object);
2449         }
2450
2451         thread_interrupt_level(interruptible_state);
2452
2453 #if TRACEFAULTPAGE
2454         dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);       /* (TEST/DEBUG) */
2455 #endif
2456         return retval;
2457
2458 backoff:
2459         thread_interrupt_level(interruptible_state);
2460
2461         if (wait_result == THREAD_INTERRUPTED)
2462                 return (VM_FAULT_INTERRUPTED);
2463         return (VM_FAULT_RETRY);
2464
2465 #undef  RELEASE_PAGE
2466 }
2467
2468
2469
2470 /*
2471  * CODE SIGNING: when soft faulting a page, we have to validate the page if:
2472  * 1. the page is being mapped in user space (pmap is not kernel_pmap)
2473  * 2. the page hasn't already been found to be "tainted"
2474  * 3. the page belongs to a code-signed object
2475  * 4. the page has not been validated yet or has been mapped for write.
2476  * NOTE: "page" is expanded up to three times; pass a side-effect-free argument.
2477  */
2478 #define VM_FAULT_NEED_CS_VALIDATION(pmap, page, page_obj)               \
2479         ((pmap) != kernel_pmap /*1*/ &&                                 \
2480          !(page)->cs_tainted /*2*/ &&                                   \
2481          (page_obj)->code_signed /*3*/ &&                                       \
2482          (!(page)->cs_validated || (page)->wpmapped /*4*/))
2483
2484
2485 /*
2486  * page queue lock must NOT be held
2487  * m->object must be locked
2488  *
2489  * NOTE: m->object could be locked "shared" only if we are called
2490  * from vm_fault() as part of a soft fault.  If so, we must be
2491  * careful not to modify the VM object in any way that is not
2492  * legal under a shared lock...
2493  */
2494 extern int panic_on_cs_killed;
2495 extern int proc_selfpid(void);
2496 extern char *proc_name_address(void *p);
2497 unsigned long cs_enter_tainted_rejected = 0;       /* NOTE(review): presumably # of tainted pages refused at fault-enter time -- confirm */
2498 unsigned long cs_enter_tainted_accepted = 0;       /* NOTE(review): presumably # of tainted pages still allowed to be entered -- confirm */
2499 kern_return_t
2500 vm_fault_enter(vm_page_t m,
2501                pmap_t pmap,
2502                vm_map_offset_t vaddr,
2503                vm_prot_t prot,
2504                vm_prot_t caller_prot,
2505                boolean_t wired,
2506                boolean_t change_wiring,
2507                vm_tag_t  wire_tag,
2508                boolean_t no_cache,
2509                boolean_t cs_bypass,
2510                __unused int      user_tag,
2511                int       pmap_options,
2512                boolean_t *need_retry,
2513                int *type_of_fault)
2514 {
2515         kern_return_t   kr, pe_result;
2516         boolean_t       previously_pmapped = m->pmapped;
2517         boolean_t       must_disconnect = 0;
2518         boolean_t       map_is_switched, map_is_switch_protected;
2519         int             cs_enforcement_enabled;
2520         vm_prot_t       fault_type;
2521         vm_object_t     object;
2522
2523         fault_type = change_wiring ? VM_PROT_NONE : caller_prot;
2524         object = VM_PAGE_OBJECT(m);
2525
2526         vm_object_lock_assert_held(object);
2527
2528 #if KASAN
2529         if (pmap == kernel_pmap) {
2530                 kasan_notify_address(vaddr, PAGE_SIZE);
2531         }
2532 #endif
2533
2534         LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2535
2536         if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
2537                 assert(m->fictitious);
2538                 return KERN_SUCCESS;
2539         }
2540
2541         if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
2542
2543                 vm_object_lock_assert_exclusive(object);
2544
2545         } else if ((fault_type & VM_PROT_WRITE) == 0 && !m->wpmapped) {
2546                 /*
2547                  * This is not a "write" fault, so we
2548                  * might not have taken the object lock
2549                  * exclusively and we might not be able
2550                  * to update the "wpmapped" bit in
2551                  * vm_fault_enter().
2552                  * Let's just grant read access to
2553                  * the page for now and we'll
2554                  * soft-fault again if we need write
2555                  * access later...
2556                  */
2557
2558                 /* This had better not be a JIT page. */
2559                 if (!pmap_has_prot_policy(prot)) {
2560                         prot &= ~VM_PROT_WRITE;
2561                 } else {
2562                         assert(cs_bypass);
2563                 }
2564         }
2565         if (m->pmapped == FALSE) {
2566
2567                 if (m->clustered) {
2568                         if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
2569                                 /*
2570                                  * found it in the cache, but this
2571                                  * is the first fault-in of the page (m->pmapped == FALSE)
2572                                  * so it must have come in as part of
2573                                  * a cluster... account 1 pagein against it
2574                                  */
2575                                 if (object->internal)
2576                                         *type_of_fault = DBG_PAGEIND_FAULT;
2577                                 else
2578                                         *type_of_fault = DBG_PAGEINV_FAULT;
2579
2580                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2581                         }
2582                         VM_PAGE_CONSUME_CLUSTERED(m);
2583                 }
2584         }
2585
2586         if (*type_of_fault != DBG_COW_FAULT) {
2587                 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2588
2589                 if (pmap == kernel_pmap) {
2590                         DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2591                 }
2592         }
2593
2594         /* Validate code signature if necessary. */
2595         if (VM_FAULT_NEED_CS_VALIDATION(pmap, m, object)) {
2596                 vm_object_lock_assert_exclusive(object);
2597
2598                 if (m->cs_validated) {
2599                         vm_cs_revalidates++;
2600                 }
2601
2602                 /* VM map is locked, so 1 ref will remain on VM object -
2603                  * so no harm if vm_page_validate_cs drops the object lock */
2604                 vm_page_validate_cs(m);
2605         }
2606
2607 #define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)
2608 #define page_nx(m) ((m)->cs_nx)
2609
2610         map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2611                            (pmap == vm_map_pmap(current_thread()->map)));
2612         map_is_switch_protected = current_thread()->map->switch_protect;
2613
2614         /* If the map is switched, and is switch-protected, we must protect
2615          * some pages from being write-faulted: immutable pages because by
2616          * definition they may not be written, and executable pages because that
2617          * would provide a way to inject unsigned code.
2618          * If the page is immutable, we can simply return. However, we can't
2619          * immediately determine whether a page is executable anywhere. But,
2620          * we can disconnect it everywhere and remove the executable protection
2621          * from the current map. We do that below right before we do the
2622          * PMAP_ENTER.
2623          */
2624         cs_enforcement_enabled = cs_enforcement(NULL);
2625
2626         if(cs_enforcement_enabled && map_is_switched &&
2627            map_is_switch_protected && page_immutable(m, prot) &&
2628            (prot & VM_PROT_WRITE))
2629         {
2630                 return KERN_CODESIGN_ERROR;
2631         }
2632
2633         if (cs_enforcement_enabled && page_nx(m) && (prot & VM_PROT_EXECUTE)) {
2634                 if (cs_debug)
2635                         printf("page marked to be NX, not letting it be mapped EXEC\n");
2636                 return KERN_CODESIGN_ERROR;
2637         }
2638
2639         if (cs_enforcement_enabled &&
2640             !m->cs_validated &&
2641             (prot & VM_PROT_EXECUTE) &&
2642             !(caller_prot & VM_PROT_EXECUTE)) {
2643                 /*
2644                  * FOURK PAGER:
2645                  * This page has not been validated and will not be
2646                  * allowed to be mapped for "execute".
2647                  * But the caller did not request "execute" access for this
2648                  * fault, so we should not raise a code-signing violation
2649                  * (and possibly kill the process) below.
2650                  * Instead, let's just remove the "execute" access request.
2651                  *
2652                  * This can happen on devices with a 4K page size if a 16K
2653                  * page contains a mix of signed&executable and
2654                  * unsigned&non-executable 4K pages, making the whole 16K
2655                  * mapping "executable".
2656                  */
2657                 if (!pmap_has_prot_policy(prot)) {
2658                         prot &= ~VM_PROT_EXECUTE;
2659                 } else {
2660                         assert(cs_bypass);
2661                 }
2662         }
2663
2664         /* A page could be tainted, or pose a risk of being tainted later.
2665          * Check whether the receiving process wants it, and make it feel
2666          * the consequences (that happens in cs_invalid_page()).
2667          * For CS Enforcement, two other conditions will
2668          * cause that page to be tainted as well:
2669          * - pmapping an unsigned page executable - this means unsigned code;
2670          * - writeable mapping of a validated page - the content of that page
2671          *   can be changed without the kernel noticing, therefore unsigned
2672          *   code can be created
2673          */
2674         if (!cs_bypass &&
2675             (m->cs_tainted ||
2676              (cs_enforcement_enabled &&
2677               (/* The page is unsigned and wants to be executable */
2678                (!m->cs_validated && (prot & VM_PROT_EXECUTE))  ||
2679                /* The page should be immutable, but is in danger of being modified
2680                 * This is the case where we want policy from the code directory -
2681                 * is the page immutable or not? For now we have to assume that
2682                 * code pages will be immutable, data pages not.
2683                 * We'll assume a page is a code page if it has a code directory
2684                 * and we fault for execution.
2685                 * That is good enough since if we faulted the code page for
2686                 * writing in another map before, it is wpmapped; if we fault
2687                 * it for writing in this map later it will also be faulted for executing
2688                 * at the same time; and if we fault for writing in another map
2689                 * later, we will disconnect it from this pmap so we'll notice
2690                 * the change.
2691                 */
2692               (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped))
2693               ))
2694                     ))
2695         {
2696                 /* We will have a tainted page. Have to handle the special case
2697                  * of a switched map now. If the map is not switched, standard
2698                  * procedure applies - call cs_invalid_page().
2699                  * If the map is switched, the real owner is invalid already.
2700                  * There is no point in invalidating the switching process since
2701                  * it will not be executing from the map. So we don't call
2702                  * cs_invalid_page() in that case. */
2703                 boolean_t reject_page, cs_killed;
2704                 if(map_is_switched) {
2705                         assert(pmap==vm_map_pmap(current_thread()->map));
2706                         assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2707                         reject_page = FALSE;
2708                 } else {
2709                         if (cs_debug > 5)
2710                                 printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s slid: %s prot: 0x%x\n",
2711                                        object->code_signed ? "yes" : "no",
2712                                        m->cs_validated ? "yes" : "no",
2713                                        m->cs_tainted ? "yes" : "no",
2714                                        m->wpmapped ? "yes" : "no",
2715                                        m->slid ? "yes" : "no",
2716                                        (int)prot);
2717                         reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed);
2718                 }
2719
2720                 if (reject_page) {
2721                         /* reject the invalid page: abort the page fault */
2722                         int                     pid;
2723                         const char              *procname;
2724                         task_t                  task;
2725                         vm_object_t             file_object, shadow;
2726                         vm_object_offset_t      file_offset;
2727                         char                    *pathname, *filename;
2728                         vm_size_t               pathname_len, filename_len;
2729                         boolean_t               truncated_path;
2730 #define __PATH_MAX 1024
2731                         struct timespec         mtime, cs_mtime;
2732                         int                     shadow_depth;
2733                         os_reason_t             codesigning_exit_reason = OS_REASON_NULL;
2734
2735                         kr = KERN_CODESIGN_ERROR;
2736                         cs_enter_tainted_rejected++;
2737
2738                         /* get process name and pid */
2739                         procname = "?";
2740                         task = current_task();
2741                         pid = proc_selfpid();
2742                         if (task->bsd_info != NULL)
2743                                 procname = proc_name_address(task->bsd_info);
2744
2745                         /* get file's VM object */
2746                         file_object = object;
2747                         file_offset = m->offset;
2748                         for (shadow = file_object->shadow,
2749                                      shadow_depth = 0;
2750                              shadow != VM_OBJECT_NULL;
2751                              shadow = file_object->shadow,
2752                                 shadow_depth++) {
2753                                 vm_object_lock_shared(shadow);
2754                                 if (file_object != object) {
2755                                         vm_object_unlock(file_object);
2756                                 }
2757                                 file_offset += file_object->vo_shadow_offset;
2758                                 file_object = shadow;
2759                         }
2760
2761                         mtime.tv_sec = 0;
2762                         mtime.tv_nsec = 0;
2763                         cs_mtime.tv_sec = 0;
2764                         cs_mtime.tv_nsec = 0;
2765
2766                         /* get file's pathname and/or filename */
2767                         pathname = NULL;
2768                         filename = NULL;
2769                         pathname_len = 0;
2770                         filename_len = 0;
2771                         truncated_path = FALSE;
2772                         /* no pager -> no file -> no pathname, use "<nil>" in that case */
2773                         if (file_object->pager != NULL) {
2774                                 pathname = (char *)kalloc(__PATH_MAX * 2);
2775                                 if (pathname) {
2776                                         pathname[0] = '\0';
2777                                         pathname_len = __PATH_MAX;
2778                                         filename = pathname + pathname_len;
2779                                         filename_len = __PATH_MAX;
2780                                 }
2781                                 vnode_pager_get_object_name(file_object->pager,
2782                                                             pathname,
2783                                                             pathname_len,
2784                                                             filename,
2785                                                             filename_len,
2786                                                             &truncated_path);
2787                                 if (pathname) {
2788                                         /* safety first... */
2789                                         pathname[__PATH_MAX-1] = '\0';
2790                                         filename[__PATH_MAX-1] = '\0';
2791                                 }
2792                                 vnode_pager_get_object_mtime(file_object->pager,
2793                                                              &mtime,
2794                                                              &cs_mtime);
2795                         }
2796                         printf("CODE SIGNING: process %d[%s]: "
2797                                "rejecting invalid page at address 0x%llx "
2798                                "from offset 0x%llx in file \"%s%s%s\" "
2799                                "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2800                                "(signed:%d validated:%d tainted:%d nx:%d "
2801                                "wpmapped:%d slid:%d dirty:%d depth:%d)\n",
2802                                pid, procname, (addr64_t) vaddr,
2803                                file_offset,
2804                                (pathname ? pathname : "<nil>"),
2805                                (truncated_path ? "/.../" : ""),
2806                                (truncated_path ? filename : ""),
2807                                cs_mtime.tv_sec, cs_mtime.tv_nsec,
2808                                ((cs_mtime.tv_sec == mtime.tv_sec &&
2809                                  cs_mtime.tv_nsec == mtime.tv_nsec)
2810                                 ? "=="
2811                                 : "!="),
2812                                mtime.tv_sec, mtime.tv_nsec,
2813                                object->code_signed,
2814                                m->cs_validated,
2815                                m->cs_tainted,
2816                                m->cs_nx,
2817                                m->wpmapped,
2818                                m->slid,
2819                                m->dirty,
2820                                shadow_depth);
2821
2822                         /*
2823                          * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
2824                          * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
2825                          * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
2826                          * will deal with the segmentation fault.
2827                          */
2828                         if (cs_killed) {
2829                                 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
2830                                                                 pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE, 0, 0);
2831
2832                                 codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
2833                                 if (codesigning_exit_reason == NULL) {
2834                                         printf("vm_fault_enter: failed to allocate codesigning exit reason\n");
2835                                 } else {
2836                                         mach_vm_address_t data_addr = 0;
2837                                         struct codesigning_exit_reason_info *ceri = NULL;
2838                                         uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri));
2839
2840                                         if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) {
2841                                                 printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
2842                                         } else {
2843                                                 if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor,
2844                                                                 EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) {
2845                                                         ceri = (struct codesigning_exit_reason_info *)data_addr;
2846                                                         static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname));
2847
2848                                                         ceri->ceri_virt_addr = vaddr;
2849                                                         ceri->ceri_file_offset = file_offset;
2850                                                         if (pathname)
2851                                                                 strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname));
2852                                                         else
2853                                                                 ceri->ceri_pathname[0] = '\0';
2854                                                         if (filename)
2855                                                                 strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename));
2856                                                         else
2857                                                                 ceri->ceri_filename[0] = '\0';
2858                                                         ceri->ceri_path_truncated = (truncated_path);
2859                                                         ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec;
2860                                                         ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec;
2861                                                         ceri->ceri_page_modtime_secs = mtime.tv_sec;
2862                                                         ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
2863                                                         ceri->ceri_object_codesigned = (object->code_signed);
2864                                                         ceri->ceri_page_codesig_validated = (m->cs_validated);
2865                                                         ceri->ceri_page_codesig_tainted = (m->cs_tainted);
2866                                                         ceri->ceri_page_codesig_nx = (m->cs_nx);
2867                                                         ceri->ceri_page_wpmapped = (m->wpmapped);
2868                                                         ceri->ceri_page_slid = (m->slid);
2869                                                         ceri->ceri_page_dirty = (m->dirty);
2870                                                         ceri->ceri_page_shadow_depth = shadow_depth;
2871                                                 } else {
2872 #if DEBUG || DEVELOPMENT
2873                                                         panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
2874 #else
2875                                                         printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
2876 #endif /* DEBUG || DEVELOPMENT */
2877                                                         /* Free the buffer */
2878                                                         os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0);
2879                                                 }
2880                                         }
2881                                 }
2882
2883                                 set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE);
2884                         }
2885                         if (panic_on_cs_killed &&
2886                             object->object_slid) {
2887                                 panic("CODE SIGNING: process %d[%s]: "
2888                                       "rejecting invalid page at address 0x%llx "
2889                                       "from offset 0x%llx in file \"%s%s%s\" "
2890                                       "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2891                                       "(signed:%d validated:%d tainted:%d nx:%d"
2892                                       "wpmapped:%d slid:%d dirty:%d depth:%d)\n",
2893                                       pid, procname, (addr64_t) vaddr,
2894                                       file_offset,
2895                                       (pathname ? pathname : "<nil>"),
2896                                       (truncated_path ? "/.../" : ""),
2897                                       (truncated_path ? filename : ""),
2898                                       cs_mtime.tv_sec, cs_mtime.tv_nsec,
2899                                       ((cs_mtime.tv_sec == mtime.tv_sec &&
2900                                         cs_mtime.tv_nsec == mtime.tv_nsec)
2901                                        ? "=="
2902                                        : "!="),
2903                                       mtime.tv_sec, mtime.tv_nsec,
2904                                       object->code_signed,
2905                                       m->cs_validated,
2906                                       m->cs_tainted,
2907                                       m->cs_nx,
2908                                       m->wpmapped,
2909                                       m->slid,
2910                                       m->dirty,
2911                                       shadow_depth);
2912                         }
2913
2914                         if (file_object != object) {
2915                                 vm_object_unlock(file_object);
2916                         }
2917                         if (pathname_len != 0) {
2918                                 kfree(pathname, __PATH_MAX * 2);
2919                                 pathname = NULL;
2920                                 filename = NULL;
2921                         }
2922                 } else {
2923                         /* proceed with the invalid page */
2924                         kr = KERN_SUCCESS;
2925                         if (!m->cs_validated &&
2926                             !object->code_signed) {
2927                                 /*
2928                                  * This page has not been (fully) validated but
2929                                  * does not belong to a code-signed object
2930                                  * so it should not be forcefully considered
2931                                  * as tainted.
2932                                  * We're just concerned about it here because
2933                                  * we've been asked to "execute" it but that
2934                                  * does not mean that it should cause other
2935                                  * accesses to fail.
2936                                  * This happens when a debugger sets a
2937                                  * breakpoint and we then execute code in
2938                                  * that page.  Marking the page as "tainted"
2939                                  * would cause any inspection tool ("leaks",
2940                                  * "vmmap", "CrashReporter", ...) to get killed
2941                                  * due to code-signing violation on that page,
2942                                  * even though they're just reading it and not
2943                                  * executing from it.
2944                                  */
2945                         } else {
2946                                 /*
2947                                  * Page might have been tainted before or not;
2948                                  * now it definitively is. If the page wasn't
2949                                  * tainted, we must disconnect it from all
2950                                  * pmaps later, to force existing mappings
2951                                  * through that code path for re-consideration
2952                                  * of the validity of that page.
2953                                  */
2954                                 must_disconnect = !m->cs_tainted;
2955                                 m->cs_tainted = TRUE;
2956                         }
2957                         cs_enter_tainted_accepted++;
2958                 }
2959                 if (kr != KERN_SUCCESS) {
2960                         if (cs_debug) {
2961                                 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2962                                        "*** INVALID PAGE ***\n",
2963                                        (long long)vaddr);
2964                         }
2965 #if !SECURE_KERNEL
2966                         if (cs_enforcement_panic) {
2967                                 panic("CODESIGNING: panicking on invalid page\n");
2968                         }
2969 #endif
2970                 }
2971
2972         } else {
2973                 /* proceed with the valid page */
2974                 kr = KERN_SUCCESS;
2975         }
2976
2977         boolean_t       page_queues_locked = FALSE;
2978 #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED()   \
2979 MACRO_BEGIN                                     \
2980         if (! page_queues_locked) {             \
2981                 page_queues_locked = TRUE;      \
2982                 vm_page_lockspin_queues();      \
2983         }                                       \
2984 MACRO_END
2985 #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED()     \
2986 MACRO_BEGIN                                     \
2987         if (page_queues_locked) {               \
2988                 page_queues_locked = FALSE;     \
2989                 vm_page_unlock_queues();        \
2990         }                                       \
2991 MACRO_END
2992
2993         /*
2994          * Hold queues lock to manipulate
2995          * the page queues.  Change wiring
2996          * case is obvious.
2997          */
2998         assert((m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object);
2999
3000 #if CONFIG_BACKGROUND_QUEUE
3001         vm_page_update_background_state(m);
3002 #endif
3003         if (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
3004                 /*
3005                  * Compressor pages are neither wired
3006                  * nor pageable and should never change.
3007                  */
3008                 assert(object == compressor_object);
3009         } else if (change_wiring) {
3010                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3011
3012                 if (wired) {
3013                         if (kr == KERN_SUCCESS) {
3014                                 vm_page_wire(m, wire_tag, TRUE);
3015                         }
3016                 } else {
3017                         vm_page_unwire(m, TRUE);
3018                 }
3019                 /* we keep the page queues lock, if we need it later */
3020
3021         } else {
3022                 if (object->internal == TRUE) {
3023                         /*
3024                          * don't allow anonymous pages on
3025                          * the speculative queues
3026                          */
3027                         no_cache = FALSE;
3028                 }
3029                 if (kr != KERN_SUCCESS) {
3030                         __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3031                         vm_page_deactivate(m);
3032                         /* we keep the page queues lock, if we need it later */
3033                 } else if (((m->vm_page_q_state == VM_PAGE_NOT_ON_Q) ||
3034                             (m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
3035                             (m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
3036                             ((m->vm_page_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
3037                            !VM_PAGE_WIRED(m)) {
3038
3039                         if (vm_page_local_q &&
3040                             (*type_of_fault == DBG_COW_FAULT ||
3041                              *type_of_fault == DBG_ZERO_FILL_FAULT) ) {
3042                                 struct vpl      *lq;
3043                                 uint32_t        lid;
3044
3045                                 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
3046
3047                                 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3048                                 vm_object_lock_assert_exclusive(object);
3049
3050                                 /*
3051                                  * we got a local queue to stuff this
3052                                  * new page on...
3053                                  * it's safe to manipulate local and
3054                                  * local_id at this point since we're
3055                                  * behind an exclusive object lock and
3056                                  * the page is not on any global queue.
3057                                  *
3058                                  * we'll use the current cpu number to
3059                                  * select the queue; note that we don't
3060                                  * need to disable preemption... we're
3061                                  * going to be behind the local queue's
3062                                  * lock to do the real work
3063                                  */
3064                                 lid = cpu_number();
3065
3066                                 lq = &vm_page_local_q[lid].vpl_un.vpl;
3067
3068                                 VPL_LOCK(&lq->vpl_lock);
3069
3070                                 vm_page_check_pageable_safe(m);
3071                                 vm_page_queue_enter(&lq->vpl_queue, m,
3072                                                     vm_page_t, pageq);
3073                                 m->vm_page_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
3074                                 m->local_id = lid;
3075                                 lq->vpl_count++;
3076
3077                                 if (object->internal)
3078                                         lq->vpl_internal_count++;
3079                                 else
3080                                         lq->vpl_external_count++;
3081
3082                                 VPL_UNLOCK(&lq->vpl_lock);
3083
3084                                 if (lq->vpl_count > vm_page_local_q_soft_limit)
3085                                 {
3086                                         /*
3087                                          * we're beyond the soft limit
3088                                          * for the local queue
3089                                          * vm_page_reactivate_local will
3090                                          * 'try' to take the global page
3091                                          * queue lock... if it can't
3092                                          * that's ok... we'll let the
3093                                          * queue continue to grow up
3094                                          * to the hard limit... at that
3095                                          * point we'll wait for the
3096                                          * lock... once we've got the
3097                                          * lock, we'll transfer all of
3098                                          * the pages from the local
3099                                          * queue to the global active
3100                                          * queue
3101                                          */
3102                                         vm_page_reactivate_local(lid, FALSE, FALSE);
3103                                 }
3104                         } else {
3105
3106                                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3107
3108                                 /*
3109                                  * test again now that we hold the
3110                                  * page queue lock
3111                                  */
3112                                 if (!VM_PAGE_WIRED(m)) {
3113                                         if (m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3114                                                 vm_page_queues_remove(m, FALSE);
3115
3116                                                 vm_pageout_cleaned_reactivated++;
3117                                                 vm_pageout_cleaned_fault_reactivated++;
3118                                         }
3119
3120                                         if ( !VM_PAGE_ACTIVE_OR_INACTIVE(m) ||
3121                                              no_cache) {
3122                                                 /*
3123                                                  * If this is a no_cache mapping
3124                                                  * and the page has never been
3125                                                  * mapped before or was
3126                                                  * previously a no_cache page,
3127                                                  * then we want to leave pages
3128                                                  * in the speculative state so
3129                                                  * that they can be readily
3130                                                  * recycled if free memory runs
3131                                                  * low.  Otherwise the page is
3132                                                  * activated as normal.
3133                                                  */
3134
3135                                                 if (no_cache &&
3136                                                     (!previously_pmapped ||
3137                                                      m->no_cache)) {
3138                                                         m->no_cache = TRUE;
3139
3140                                                         if (m->vm_page_q_state != VM_PAGE_ON_SPECULATIVE_Q)
3141                                                                 vm_page_speculate(m, FALSE);
3142
3143                                                 } else if ( !VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
3144                                                         vm_page_activate(m);
3145                                                 }
3146                                         }
3147                                 }
3148                                 /* we keep the page queues lock, if we need it later */
3149                         }
3150                 }
3151         }
3152         /* we're done with the page queues lock, if we ever took it */
3153         __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3154
3155
3156         /* If we have a KERN_SUCCESS from the previous checks, we either have
3157          * a good page, or a tainted page that has been accepted by the process.
3158          * In both cases the page will be entered into the pmap.
3159          * If the page is writeable, we need to disconnect it from other pmaps
3160          * now so those processes can take note.
3161          */
3162         if (kr == KERN_SUCCESS) {
3163                 /*
3164                  * NOTE: we may only hold the vm_object lock SHARED
3165                  * at this point, so we need the phys_page lock to
3166                  * properly serialize updating the pmapped and
3167                  * xpmapped bits
3168                  */
3169                 if ((prot & VM_PROT_EXECUTE) && !m->xpmapped) {
3170                         ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3171
3172                         pmap_lock_phys_page(phys_page);
3173                         /*
3174                          * go ahead and take the opportunity
3175                          * to set 'pmapped' here so that we don't
3176                          * need to grab this lock a 2nd time
3177                          * just below
3178                          */
3179                         m->pmapped = TRUE;
3180
3181                         if (!m->xpmapped) {
3182
3183                                 m->xpmapped = TRUE;
3184
3185                                 pmap_unlock_phys_page(phys_page);
3186
3187                                 if (!object->internal)
3188                                         OSAddAtomic(1, &vm_page_xpmapped_external_count);
3189
3190 #if defined(__arm__) || defined(__arm64__)
3191                                 pmap_sync_page_data_phys(phys_page);
3192 #else
3193                                 if (object->internal &&
3194                                     object->pager != NULL) {
3195                                         /*
3196                                          * This page could have been
3197                                          * uncompressed by the
3198                                          * compressor pager and its
3199                                          * contents might be only in
3200                                          * the data cache.
3201                                          * Since it's being mapped for
3202                                          * "execute" for the first time,
3203                                          * make sure the icache is in
3204                                          * sync.
3205                                          */
3206                                         assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3207                                         pmap_sync_page_data_phys(phys_page);
3208                                 }
3209 #endif
3210                         } else
3211                                 pmap_unlock_phys_page(phys_page);
3212                 } else {
3213                         if (m->pmapped == FALSE) {
3214                                 ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3215
3216                                 pmap_lock_phys_page(phys_page);
3217                                 m->pmapped = TRUE;
3218                                 pmap_unlock_phys_page(phys_page);
3219                         }
3220                 }
3221                 if (vm_page_is_slideable(m)) {
3222                         boolean_t was_busy = m->busy;
3223
3224                         vm_object_lock_assert_exclusive(object);
3225
3226                         m->busy = TRUE;
3227                         kr = vm_page_slide(m, 0);
3228                         assert(m->busy);
3229                         if(!was_busy) {
3230                                 PAGE_WAKEUP_DONE(m);
3231                         }
3232                         if (kr != KERN_SUCCESS) {
3233                                 /*
3234                                  * This page has not been slid correctly,
3235                                  * do not do the pmap_enter() !
3236                                  * Let vm_fault_enter() return the error
3237                                  * so the caller can fail the fault.
3238                                  */
3239                                 goto after_the_pmap_enter;
3240                         }
3241                 }
3242
3243                 if (fault_type & VM_PROT_WRITE) {
3244
3245                         if (m->wpmapped == FALSE) {
3246                                 vm_object_lock_assert_exclusive(object);
3247                                 if (!object->internal && object->pager) {
3248                                         task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager));
3249                                 }
3250                                 m->wpmapped = TRUE;
3251                         }
3252                         if (must_disconnect) {
3253                                 /*
3254                                  * We can only get here
3255                                  * because of the CSE logic
3256                                  */
3257                                 assert(cs_enforcement_enabled);
3258                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3259                                 /*
3260                                  * If we are faulting for a write, we can clear
3261                                  * the execute bit - that will ensure the page is
3262                                  * checked again before being executable, which
3263                                  * protects against a map switch.
3264                                  * This only happens the first time the page
3265                                  * gets tainted, so we won't get stuck here
3266                                  * to make an already writeable page executable.
3267                                  */
3268                                 if (!cs_bypass){
3269                                         assert(!pmap_has_prot_policy(prot));
3270                                         prot &= ~VM_PROT_EXECUTE;
3271                                 }
3272                         }
3273                 }
3274                 assert(VM_PAGE_OBJECT(m) == object);
3275
3276                 /* Prevent a deadlock by not
3277                  * holding the object lock if we need to wait for a page in
3278                  * pmap_enter() - <rdar://problem/7138958> */
3279                 PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0,
3280                                    wired,
3281                                    pmap_options | PMAP_OPTIONS_NOWAIT,
3282                                    pe_result);
3283 #if __x86_64__
3284                 if (pe_result == KERN_INVALID_ARGUMENT &&
3285                     pmap == PMAP_NULL &&
3286                     wired) {
3287                         /*
3288                          * Wiring a page in a pmap-less VM map:
3289                          * VMware's "vmmon" kernel extension does this
3290                          * to grab pages.
3291                          * Let it proceed even though the PMAP_ENTER() failed.
3292                          */
3293                         pe_result = KERN_SUCCESS;
3294                 }
3295 #endif /* __x86_64__ */
3296
3297                 if(pe_result == KERN_RESOURCE_SHORTAGE) {
3298
3299                         if (need_retry) {
3300                                 /*
3301                                  * this will be non-null in the case where we hold the lock
3302                                  * on the top-object in this chain... we can't just drop
3303                                  * the lock on the object we're inserting the page into
3304                                  * and recall the PMAP_ENTER since we can still cause
3305                                  * a deadlock if one of the critical paths tries to
3306                                  * acquire the lock on the top-object and we're blocked
3307                                  * in PMAP_ENTER waiting for memory... our only recourse
3308                                  * is to deal with it at a higher level where we can
3309                                  * drop both locks.
3310                                  */
3311                                 *need_retry = TRUE;
3312                                 vm_pmap_enter_retried++;
3313                                 goto after_the_pmap_enter;
3314                         }
3315                         /* The nonblocking version of pmap_enter did not succeed.
3316                          * and we don't need to drop other locks and retry
3317                          * at the level above us, so
3318                          * use the blocking version instead. Requires marking
3319                          * the page busy and unlocking the object */
3320                         boolean_t was_busy = m->busy;
3321
3322                         vm_object_lock_assert_exclusive(object);
3323
3324                         m->busy = TRUE;
3325                         vm_object_unlock(object);
3326
3327                         PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type,
3328                                            0, wired,
3329                                            pmap_options, pe_result);
3330
3331                         assert(VM_PAGE_OBJECT(m) == object);
3332
3333                         /* Take the object lock again. */
3334                         vm_object_lock(object);
3335
3336                         /* If the page was busy, someone else will wake it up.
3337                          * Otherwise, we have to do it now. */
3338                         assert(m->busy);
3339                         if(!was_busy) {
3340                                 PAGE_WAKEUP_DONE(m);
3341                         }
3342                         vm_pmap_enter_blocked++;
3343                 }
3344
3345                 kr = pe_result;
3346         }
3347
3348 after_the_pmap_enter:
3349         return kr;
3350 }
3351
3352 void
3353 vm_pre_fault(vm_map_offset_t vaddr)
3354 {
3355         if (pmap_find_phys(current_map()->pmap, vaddr) == 0) {
3356
3357                 vm_fault(current_map(),      /* map */
3358                         vaddr,               /* vaddr */
3359                         VM_PROT_READ,        /* fault_type */
3360                         FALSE,               /* change_wiring */
3361                         VM_KERN_MEMORY_NONE, /* tag - not wiring */
3362                         THREAD_UNINT,        /* interruptible */
3363                         NULL,                /* caller_pmap */
3364                         0                    /* caller_pmap_addr */);
3365         }
3366 }
3367
3368
3369 /*
3370  *      Routine:        vm_fault
3371  *      Purpose:
3372  *              Handle page faults, including pseudo-faults
3373  *              used to change the wiring status of pages.
3374  *      Returns:
3375  *              Explicit continuations have been removed.
3376  *      Implementation:
3377  *              vm_fault and vm_fault_page save mucho state
3378  *              in the moral equivalent of a closure.  The state
3379  *              structure is allocated when first entering vm_fault
3380  *              and deallocated when leaving vm_fault.
3381  */
3382
/*
 * Symbols defined outside this file.
 * NOTE(review): neither is referenced in the portion of the file
 * reviewed here; their purpose below is inferred from the names only.
 */
extern int _map_enter_debug;			/* presumably a debug switch -- TODO confirm */
extern uint64_t get_current_unique_pid(void);	/* returns a 64-bit id; presumably the current process's unique pid -- TODO confirm */

/*
 * Global counters, both starting at zero.
 * NOTE(review): by name these look like totals of object collapses
 * attempted / skipped on the fault path; the code that updates them is
 * outside the portion reviewed here -- confirm against their users.
 */
unsigned long vm_fault_collapse_total = 0;
unsigned long vm_fault_collapse_skipped = 0;
3388
3389
3390 kern_return_t
3391 vm_fault_external(
3392         vm_map_t        map,
3393         vm_map_offset_t vaddr,
3394         vm_prot_t       fault_type,
3395         boolean_t       change_wiring,
3396         int             interruptible,
3397         pmap_t          caller_pmap,
3398         vm_map_offset_t caller_pmap_addr)
3399 {
3400         return vm_fault_internal(map, vaddr, fault_type, change_wiring, vm_tag_bt(),
3401                                  interruptible, caller_pmap, caller_pmap_addr,
3402                                  NULL);
3403 }
3404
3405 kern_return_t
3406 vm_fault(
3407         vm_map_t        map,
3408         vm_map_offset_t vaddr,
3409         vm_prot_t       fault_type,
3410         boolean_t       change_wiring,
3411         vm_tag_t        wire_tag,               /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3412         int             interruptible,
3413         pmap_t          caller_pmap,
3414         vm_map_offset_t caller_pmap_addr)
3415 {
3416         return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag,
3417                                  interruptible, caller_pmap, caller_pmap_addr,
3418                                  NULL);
3419 }
3420
3421 kern_return_t
3422 vm_fault_internal(
3423         vm_map_t        map,
3424         vm_map_offset_t vaddr,
3425         vm_prot_t       caller_prot,
3426         boolean_t       change_wiring,
3427         vm_tag_t        wire_tag,               /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3428         int             interruptible,
3429         pmap_t          caller_pmap,
3430         vm_map_offset_t caller_pmap_addr,
3431         ppnum_t         *physpage_p)
3432 {
3433         vm_map_version_t        version;        /* Map version for verificiation */
3434         boolean_t               wired;          /* Should mapping be wired down? */
3435         vm_object_t             object;         /* Top-level object */
3436         vm_object_offset_t      offset;         /* Top-level offset */
3437         vm_prot_t               prot;           /* Protection for mapping */
3438         vm_object_t             old_copy_object; /* Saved copy object */
3439         vm_page_t               result_page;    /* Result of vm_fault_page */
3440         vm_page_t               top_page;       /* Placeholder page */
3441         kern_return_t           kr;
3442
3443         vm_page_t               m;      /* Fast access to result_page */
3444         kern_return_t           error_code;
3445         vm_object_t             cur_object;
3446         vm_object_t             m_object = NULL;
3447         vm_object_offset_t      cur_offset;
3448         vm_page_t               cur_m;
3449         vm_object_t             new_object;
3450         int                     type_of_fault;
3451         pmap_t                  pmap;
3452         boolean_t               interruptible_state;
3453         vm_map_t                real_map = map;
3454         vm_map_t                original_map = map;
3455         boolean_t               object_locks_dropped = FALSE;
3456         vm_prot_t               fault_type;
3457         vm_prot_t               original_fault_type;
3458         struct vm_object_fault_info fault_info;
3459         boolean_t               need_collapse = FALSE;
3460         boolean_t               need_retry = FALSE;
3461         boolean_t               *need_retry_ptr = NULL;
3462         int                     object_lock_type = 0;
3463         int                     cur_object_lock_type;
3464         vm_object_t             top_object = VM_OBJECT_NULL;
3465         int                     throttle_delay;
3466         int                     compressed_count_delta;
3467         int                     grab_options;
3468         vm_map_offset_t         trace_vaddr;
3469         vm_map_offset_t         trace_real_vaddr;
3470 #if DEVELOPMENT || DEBUG
3471         vm_map_offset_t         real_vaddr;
3472
3473         real_vaddr = vaddr;
3474 #endif /* DEVELOPMENT || DEBUG */
3475         trace_real_vaddr = vaddr;
3476         vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
3477
3478         if (map == kernel_map) {
3479                 trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
3480                 trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
3481         } else {
3482                 trace_vaddr = vaddr;
3483         }
3484
3485         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3486                       (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
3487                               ((uint64_t)trace_vaddr >> 32),
3488                               trace_vaddr,
3489                               (map == kernel_map),
3490                               0,
3491                               0);
3492
3493         if (get_preemption_level() != 0) {
3494                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3495                                       (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3496                                       ((uint64_t)trace_vaddr >> 32),
3497                                       trace_vaddr,
3498                                       KERN_FAILURE,
3499                                       0,
3500                                       0);
3501
3502                 return (KERN_FAILURE);
3503         }
3504
3505         interruptible_state = thread_interrupt_level(interruptible);
3506
3507         fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
3508
3509         VM_STAT_INCR(faults);
3510         current_task()->faults++;
3511         original_fault_type = fault_type;
3512
3513         if (fault_type & VM_PROT_WRITE)
3514                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3515         else
3516                 object_lock_type = OBJECT_LOCK_SHARED;
3517
3518         cur_object_lock_type = OBJECT_LOCK_SHARED;
3519
3520         if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
3521                 if (compressor_map) {
3522                         if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
3523                                 panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map));
3524
3525                         }
3526                 }
3527         }
3528 RetryFault:
3529         /*
3530          * assume we will hit a page in the cache
3531          * otherwise, explicitly override with
3532          * the real fault type once we determine it
3533          */
3534         type_of_fault = DBG_CACHE_HIT_FAULT;
3535
3536         /*
3537          *      Find the backing store object and offset into
3538          *      it to begin the search.
3539          */
3540         fault_type = original_fault_type;
3541         map = original_map;
3542         vm_map_lock_read(map);
3543
3544         kr = vm_map_lookup_locked(&map, vaddr, fault_type,
3545                                   object_lock_type, &version,
3546                                   &object, &offset, &prot, &wired,
3547                                   &fault_info,
3548                                   &real_map);
3549
3550         if (kr != KERN_SUCCESS) {
3551                 vm_map_unlock_read(map);
3552                 goto done;
3553         }
3554         pmap = real_map->pmap;
3555         fault_info.interruptible = interruptible;
3556         fault_info.stealth = FALSE;
3557         fault_info.io_sync = FALSE;
3558         fault_info.mark_zf_absent = FALSE;
3559         fault_info.batch_pmap_op = FALSE;
3560
3561         /*
3562          * If the page is wired, we must fault for the current protection
3563          * value, to avoid further faults.
3564          */
3565         if (wired) {
3566                 fault_type = prot | VM_PROT_WRITE;
3567                 /*
3568                  * since we're treating this fault as a 'write'
3569                  * we must hold the top object lock exclusively
3570                  */
3571                 if (object_lock_type == OBJECT_LOCK_SHARED) {
3572
3573                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3574
3575                         if (vm_object_lock_upgrade(object) == FALSE) {
3576                                 /*
3577                                  * couldn't upgrade, so explicitly
3578                                  * take the lock exclusively
3579                                  */
3580                                 vm_object_lock(object);
3581                         }
3582                 }
3583         }
3584
3585 #if     VM_FAULT_CLASSIFY
3586         /*
3587          *      Temporary data gathering code
3588          */
3589         vm_fault_classify(object, offset, fault_type);
3590 #endif
3591         /*
3592          *      Fast fault code.  The basic idea is to do as much as
3593          *      possible while holding the map lock and object locks.
3594          *      Busy pages are not used until the object lock has to
3595          *      be dropped to do something (copy, zero fill, pmap enter).
3596          *      Similarly, paging references aren't acquired until that
3597          *      point, and object references aren't used.
3598          *
3599          *      If we can figure out what to do
3600          *      (zero fill, copy on write, pmap enter) while holding
3601          *      the locks, then it gets done.  Otherwise, we give up,
3602          *      and use the original fault path (which doesn't hold
3603          *      the map lock, and relies on busy pages).
3604          *      The give up cases include:
3605          *              - Have to talk to pager.
3606          *              - Page is busy, absent or in error.
3607          *              - Pager has locked out desired access.
3608          *              - Fault needs to be restarted.
3609          *              - Have to push page into copy object.
3610          *
3611          *      The code is an infinite loop that moves one level down
3612          *      the shadow chain each time.  cur_object and cur_offset
3613          *      refer to the current object being examined. object and offset
3614          *      are the original object from the map.  The loop is at the
3615          *      top level if and only if object and cur_object are the same.
3616          *
3617          *      Invariants:  Map lock is held throughout.  Lock is held on
3618          *              original object and cur_object (if different) when
3619          *              continuing or exiting loop.
3620          *
3621          */
3622
3623 #if defined(__arm64__)
3624         /*
3625          * Fail if reading an execute-only page in a
3626          * pmap that enforces execute-only protection.
3627          */
3628         if (fault_type == VM_PROT_READ &&
3629                 (prot & VM_PROT_EXECUTE) &&
3630                 !(prot & VM_PROT_READ) &&
3631                 pmap_enforces_execute_only(pmap)) {
3632                         vm_object_unlock(object);
3633                         vm_map_unlock_read(map);
3634                         if (real_map != map) {
3635                                 vm_map_unlock(real_map);
3636                         }
3637                         kr = KERN_PROTECTION_FAILURE;
3638                         goto done;
3639         }
3640 #endif
3641
3642         /*
3643          * If this page is to be inserted in a copy delay object
3644          * for writing, and if the object has a copy, then the
3645          * copy delay strategy is implemented in the slow fault page.
3646          */
3647         if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
3648             object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
3649                 goto handle_copy_delay;
3650
3651         cur_object = object;
3652         cur_offset = offset;
3653
3654         grab_options = 0;
3655 #if CONFIG_SECLUDED_MEMORY
3656         if (object->can_grab_secluded) {
3657                 grab_options |= VM_PAGE_GRAB_SECLUDED;
3658         }
3659 #endif /* CONFIG_SECLUDED_MEMORY */
3660
3661         while (TRUE) {
3662                 if (!cur_object->pager_created &&
3663                     cur_object->phys_contiguous) /* superpage */
3664                         break;
3665
3666                 if (cur_object->blocked_access) {
3667                         /*
3668                          * Access to this VM object has been blocked.
3669                          * Let the slow path handle it.
3670                          */
3671                         break;
3672                 }
3673
3674                 m = vm_page_lookup(cur_object, cur_offset);
3675                 m_object = NULL;
3676
3677                 if (m != VM_PAGE_NULL) {
3678                         m_object = cur_object;
3679
3680                         if (m->busy) {
3681                                 wait_result_t   result;
3682
3683                                 /*
3684                                  * in order to do the PAGE_ASSERT_WAIT, we must
3685                                  * have object that 'm' belongs to locked exclusively
3686                                  */
3687                                 if (object != cur_object) {
3688
3689                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3690
3691                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3692
3693                                                 if (vm_object_lock_upgrade(cur_object) == FALSE) {
3694                                                         /*
3695                                                          * couldn't upgrade so go do a full retry
3696                                                          * immediately since we can no longer be
3697                                                          * certain about cur_object (since we
3698                                                          * don't hold a reference on it)...
3699                                                          * first drop the top object lock
3700                                                          */
3701                                                         vm_object_unlock(object);
3702
3703                                                         vm_map_unlock_read(map);
3704                                                         if (real_map != map)
3705                                                                 vm_map_unlock(real_map);
3706
3707                                                         goto RetryFault;
3708                                                 }
3709                                         }
3710                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3711
3712                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3713
3714                                         if (vm_object_lock_upgrade(object) == FALSE) {
3715                                                 /*
3716                                                  * couldn't upgrade, so explicitly take the lock
3717                                                  * exclusively and go relookup the page since we
3718                                                  * will have dropped the object lock and
3719                                                  * a different thread could have inserted
3720                                                  * a page at this offset
3721                                                  * no need for a full retry since we're
3722                                                  * at the top level of the object chain
3723                                                  */
3724                                                 vm_object_lock(object);
3725
3726                                                 continue;
3727                                         }
3728                                 }
3729                                 if ((m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
3730                                         /*
3731                                          * m->busy == TRUE and the object is locked exclusively
3732                                          * if m->pageout_queue == TRUE after we acquire the
3733                                          * queues lock, we are guaranteed that it is stable on
3734                                          * the pageout queue and therefore reclaimable
3735                                          *
3736                                          * NOTE: this is only true for the internal pageout queue
3737                                          * in the compressor world
3738                                          */
3739                                         assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3740
3741                                         vm_page_lock_queues();
3742
3743                                         if (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) {
3744                                                 vm_pageout_throttle_up(m);
3745                                                 vm_page_unlock_queues();
3746
3747                                                 PAGE_WAKEUP_DONE(m);
3748                                                 goto reclaimed_from_pageout;
3749                                         }
3750                                         vm_page_unlock_queues();
3751                                 }
3752                                 if (object != cur_object)
3753                                         vm_object_unlock(object);
3754
3755                                 vm_map_unlock_read(map);
3756                                 if (real_map != map)
3757                                         vm_map_unlock(real_map);
3758
3759                                 result = PAGE_ASSERT_WAIT(m, interruptible);
3760
3761                                 vm_object_unlock(cur_object);
3762
3763                                 if (result == THREAD_WAITING) {
3764                                         result = thread_block(THREAD_CONTINUE_NULL);
3765
3766                                         counter(c_vm_fault_page_block_busy_kernel++);
3767                                 }
3768                                 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
3769                                         goto RetryFault;
3770
3771                                 kr = KERN_ABORTED;
3772                                 goto done;
3773                         }
3774 reclaimed_from_pageout:
3775                         if (m->laundry) {
3776                                 if (object != cur_object) {
3777                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3778                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3779
3780                                                 vm_object_unlock(object);
3781                                                 vm_object_unlock(cur_object);
3782
3783                                                 vm_map_unlock_read(map);
3784                                                 if (real_map != map)
3785                                                         vm_map_unlock(real_map);
3786
3787                                                 goto RetryFault;
3788                                         }
3789
3790                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3791
3792                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3793
3794                                         if (vm_object_lock_upgrade(object) == FALSE) {
3795                                                 /*
3796                                                  * couldn't upgrade, so explicitly take the lock
3797                                                  * exclusively and go relookup the page since we
3798                                                  * will have dropped the object lock and
3799                                                  * a different thread could have inserted
3800                                                  * a page at this offset
3801                                                  * no need for a full retry since we're
3802                                                  * at the top level of the object chain
3803                                                  */
3804                                                 vm_object_lock(object);
3805
3806                                                 continue;
3807                                         }
3808                                 }
3809                                 vm_pageout_steal_laundry(m, FALSE);
3810                         }
3811
3812                         if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
3813                                 /*
3814                                  * Guard page: let the slow path deal with it
3815                                  */
3816                                 break;
3817                         }
3818                         if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
3819                                 /*
3820                                  * Unusual case... let the slow path deal with it
3821                                  */
3822                                 break;
3823                         }
3824                         if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
3825                                 if (object != cur_object)
3826                                         vm_object_unlock(object);
3827                                 vm_map_unlock_read(map);
3828                                 if (real_map != map)
3829                                         vm_map_unlock(real_map);
3830                                 vm_object_unlock(cur_object);
3831                                 kr = KERN_MEMORY_ERROR;
3832                                 goto done;
3833                         }
3834                         if (vm_page_is_slideable(m)) {
3835                                 /*
3836                                  * We might need to slide this page, and so,
3837                                  * we want to hold the VM object exclusively.
3838                                  */
3839                                 if (object != cur_object) {
3840                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3841                                                 vm_object_unlock(object);
3842                                                 vm_object_unlock(cur_object);
3843
3844                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3845
3846                                                 vm_map_unlock_read(map);
3847                                                 if (real_map != map)
3848                                                         vm_map_unlock(real_map);
3849
3850                                                 goto RetryFault;
3851                                         }
3852                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3853
3854                                         vm_object_unlock(object);
3855                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3856                                         vm_map_unlock_read(map);
3857                                         goto RetryFault;
3858                                 }
3859                         }
3860                         assert(m_object == VM_PAGE_OBJECT(m));
3861
3862                         if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m, m_object) ||
3863                             (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
3864 upgrade_for_validation:
3865                                 /*
3866                                  * We might need to validate this page
3867                                  * against its code signature, so we
3868                                  * want to hold the VM object exclusively.
3869                                  */
3870                                 if (object != cur_object) {
3871                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3872                                                 vm_object_unlock(object);
3873                                                 vm_object_unlock(cur_object);
3874
3875                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3876
3877                                                 vm_map_unlock_read(map);
3878                                                 if (real_map != map)
3879                                                         vm_map_unlock(real_map);
3880
3881                                                 goto RetryFault;
3882                                         }
3883
3884                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3885
3886                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3887
3888                                         if (vm_object_lock_upgrade(object) == FALSE) {
3889                                                 /*
3890                                                  * couldn't upgrade, so explicitly take the lock
3891                                                  * exclusively and go relookup the page since we
3892                                                  * will have dropped the object lock and
3893                                                  * a different thread could have inserted
3894                                                  * a page at this offset
3895                                                  * no need for a full retry since we're
3896                                                  * at the top level of the object chain
3897                                                  */
3898                                                 vm_object_lock(object);
3899
3900                                                 continue;
3901                                         }
3902                                 }
3903                         }
3904                         /*
3905                          *      Two cases of map in faults:
3906                          *          - At top level w/o copy object.
3907                          *          - Read fault anywhere.
3908                          *              --> must disallow write.
3909                          */
3910
3911                         if (object == cur_object && object->copy == VM_OBJECT_NULL) {
3912
3913                                 goto FastPmapEnter;
3914                         }
3915
3916                         if ((fault_type & VM_PROT_WRITE) == 0) {
3917                                 if (!pmap_has_prot_policy(prot)) {
3918                                         prot &= ~VM_PROT_WRITE;
3919                                 } else {
3920                                         /*
3921                                          * For a protection that the pmap cares
3922                                          * about, we must hand over the full
3923                                          * set of protections (so that the pmap
3924                                          * layer can apply any desired policy).
3925                                          * This means that cs_bypass must be
3926                                          * set, as this can force us to pass
3927                                          * RWX.
3928                                          */
3929                                         assert(fault_info.cs_bypass);
3930                                 }
3931
3932                                 if (object != cur_object) {
3933                                         /*
3934                                          * We still need to hold the top object
3935                                          * lock here to prevent a race between
3936                                          * a read fault (taking only "shared"
3937                                          * locks) and a write fault (taking
3938                                          * an "exclusive" lock on the top
3939                                          * object).
3940                                          * Otherwise, as soon as we release the
3941                                          * top lock, the write fault could
3942                                          * proceed and actually complete before
3943                                          * the read fault, and the copied page's
3944                                          * translation could then be overwritten
3945                                          * by the read fault's translation for
3946                                          * the original page.
3947                                          *
3948                                          * Let's just record what the top object
3949                                          * is and we'll release it later.
3950                                          */
3951                                         top_object = object;
3952
3953                                         /*
3954                                          * switch to the object that has the new page
3955                                          */
3956                                         object = cur_object;
3957                                         object_lock_type = cur_object_lock_type;
3958                                 }
3959 FastPmapEnter:
3960                                 assert(m_object == VM_PAGE_OBJECT(m));
3961
3962                                 /*
3963                                  * prepare for the pmap_enter...
3964                                  * object and map are both locked
3965                                  * m contains valid data
3966                                  * object == m->object
3967                                  * cur_object == NULL or it's been unlocked
3968                                  * no paging references on either object or cur_object
3969                                  */
3970                                 if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE)
3971                                         need_retry_ptr = &need_retry;
3972                                 else
3973                                         need_retry_ptr = NULL;
3974
3975                                 if (caller_pmap) {
3976                                         kr = vm_fault_enter(m,
3977                                                             caller_pmap,
3978                                                             caller_pmap_addr,
3979                                                             prot,
3980                                                             caller_prot,
3981                                                             wired,
3982                                                             change_wiring,
3983                                                             wire_tag,
3984                                                             fault_info.no_cache,
3985                                                             fault_info.cs_bypass,
3986                                                             fault_info.user_tag,
3987                                                             fault_info.pmap_options,
3988                                                             need_retry_ptr,
3989                                                             &type_of_fault);
3990                                 } else {
3991                                         kr = vm_fault_enter(m,
3992                                                             pmap,
3993                                                             vaddr,
3994                                                             prot,
3995                                                             caller_prot,
3996                                                             wired,
3997                                                             change_wiring,
3998                                                             wire_tag,
3999                                                             fault_info.no_cache,
4000                                                             fault_info.cs_bypass,
4001                                                             fault_info.user_tag,
4002                                                             fault_info.pmap_options,
4003                                                             need_retry_ptr,
4004                                                             &type_of_fault);
4005                                 }
4006 #if DEVELOPMENT || DEBUG
4007                                 {
4008                                 int     event_code = 0;
4009
4010                                 if (m_object->internal)
4011                                         event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
4012                                 else if (m_object->object_slid)
4013                                         event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
4014                                 else
4015                                         event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
4016
4017                                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->offset, get_current_unique_pid(), 0);
4018
4019                                 DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
4020                                 }
4021 #endif
4022                                 if (kr == KERN_SUCCESS &&
4023                                     physpage_p != NULL) {
4024                                         /* for vm_map_wire_and_extract() */
4025                                         *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
4026                                         if (prot & VM_PROT_WRITE) {
4027                                                 vm_object_lock_assert_exclusive(m_object);
4028                                                 m->dirty = TRUE;
4029                                         }
4030                                 }
4031
4032                                 if (top_object != VM_OBJECT_NULL) {
4033                                         /*
4034                                          * It's safe to drop the top object
4035                                          * now that we've done our
4036                                          * vm_fault_enter().  Any other fault
4037                                          * in progress for that virtual
4038                                          * address will either find our page
4039                                          * and translation or put in a new page
4040                                          * and translation.
4041                                          */
4042                                         vm_object_unlock(top_object);
4043                                         top_object = VM_OBJECT_NULL;
4044                                 }
4045
4046                                 if (need_collapse == TRUE)
4047                                         vm_object_collapse(object, offset, TRUE);
4048
4049                                 if (need_retry == FALSE &&
4050                                     (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
4051                                         /*
4052                                          * evaluate access pattern and update state
4053                                          * vm_fault_deactivate_behind depends on the
4054                                          * state being up to date
4055                                          */
4056                                         vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
4057
4058                                         vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
4059                                 }
4060                                 /*
4061                                  * That's it, clean up and return.
4062                                  */
4063                                 if (m->busy)
4064                                         PAGE_WAKEUP_DONE(m);
4065
4066                                 vm_object_unlock(object);
4067
4068                                 vm_map_unlock_read(map);
4069                                 if (real_map != map)
4070                                         vm_map_unlock(real_map);
4071
4072                                 if (need_retry == TRUE) {
4073                                         /*
4074                                          * vm_fault_enter couldn't complete the PMAP_ENTER...
4075                                          * at this point we don't hold any locks so it's safe
4076                                          * to ask the pmap layer to expand the page table to
4077                                          * accommodate this mapping... once expanded, we'll
4078                                          * re-drive the fault which should result in vm_fault_enter
4079                                          * being able to successfully enter the mapping this time around
4080                                          */
4081                                         (void)pmap_enter_options(
4082                                                 pmap, vaddr, 0, 0, 0, 0, 0,
4083                                                 PMAP_OPTIONS_NOENTER, NULL);
4084
4085                                         need_retry = FALSE;
4086                                         goto RetryFault;
4087                                 }
4088                                 goto done;
4089                         }
4090                         /*
4091                          * COPY ON WRITE FAULT
4092                          */
4093                         assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
4094
4095                         /*
4096                          * If objects match, then
4097                          * object->copy must not be NULL (else control
4098                          * would be in previous code block), and we
4099                          * have a potential push into the copy object
4100                          * which we can't cope with here.
4101                          */
4102                         if (cur_object == object) {
4103                                 /*
4104                                  * must take the slow path to
4105                                  * deal with the copy push
4106                                  */
4107                                 break;
4108                         }
4109
4110                         /*
4111                          * This is now a shadow based copy on write
4112                          * fault -- it requires a copy up the shadow
4113                          * chain.
4114                          */
4115                         assert(m_object == VM_PAGE_OBJECT(m));
4116
4117                         if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
4118                             VM_FAULT_NEED_CS_VALIDATION(NULL, m, m_object)) {
4119                                 goto upgrade_for_validation;
4120                         }
4121
4122                         /*
4123                          * Allocate a page in the original top level
4124                          * object. Give up if allocate fails.  Also
4125                          * need to remember current page, as it's the
4126                          * source of the copy.
4127                          *
4128                          * at this point we hold locks on both
4129                          * object and cur_object... no need to take
4130                          * paging refs or mark pages BUSY since
4131                          * we don't drop either object lock until
4132                          * the page has been copied and inserted
4133                          */
4134                         cur_m = m;
4135                         m = vm_page_grab_options(grab_options);
4136                         m_object = NULL;
4137
4138                         if (m == VM_PAGE_NULL) {
4139                                 /*
4140                                  * no free page currently available...
4141                                  * must take the slow path
4142                                  */
4143                                 break;
4144                         }
4145                         /*
4146                          * Now do the copy.  Mark the source page busy...
4147                          *
4148                          *      NOTE: This code holds the map lock across
4149                          *      the page copy.
4150                          */
4151                         vm_page_copy(cur_m, m);
4152                         vm_page_insert(m, object, offset);
4153                         m_object = object;
4154                         SET_PAGE_DIRTY(m, FALSE);
4155
4156                         /*
4157                          * Now cope with the source page and object
4158                          */
4159                         if (object->ref_count > 1 && cur_m->pmapped)
4160                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
4161
4162                         if (cur_m->clustered) {
4163                                 VM_PAGE_COUNT_AS_PAGEIN(cur_m);
4164                                 VM_PAGE_CONSUME_CLUSTERED(cur_m);
4165                                 vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior);
4166                         }
4167                         need_collapse = TRUE;
4168
4169                         if (!cur_object->internal &&
4170                             cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
4171                                 /*
4172                                  * The object from which we've just
4173                                  * copied a page is most probably backed
4174                                  * by a vnode.  We don't want to waste too
4175                                  * much time trying to collapse the VM objects
4176                                  * and create a bottleneck when several tasks
4177                                  * map the same file.
4178                                  */
4179                                 if (cur_object->copy == object) {
4180                                         /*
4181                                          * Shared mapping or no COW yet.
4182                                          * We can never collapse a copy
4183                                          * object into its backing object.
4184                                          */
4185                                         need_collapse = FALSE;
4186                                 } else if (cur_object->copy == object->shadow &&
4187                                            object->shadow->resident_page_count == 0) {
4188                                         /*
4189                                          * Shared mapping after a COW occurred.
4190                                          */
4191                                         need_collapse = FALSE;
4192                                 }
4193                         }
4194                         vm_object_unlock(cur_object);
4195
4196                         if (need_collapse == FALSE)
4197                                 vm_fault_collapse_skipped++;
4198                         vm_fault_collapse_total++;
4199
4200                         type_of_fault = DBG_COW_FAULT;
4201                         VM_STAT_INCR(cow_faults);
4202                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
4203                         current_task()->cow_faults++;
4204
4205                         goto FastPmapEnter;
4206
4207                 } else {
4208                         /*
4209                          * No page at cur_object, cur_offset... m == NULL
4210                          */
4211                         if (cur_object->pager_created) {
4212                                 int     compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
4213
4214                                 if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
4215                                         int             my_fault_type;
4216                                         int             c_flags = C_DONT_BLOCK;
4217                                         boolean_t       insert_cur_object = FALSE;
4218
4219                                         /*
4220                                          * May have to talk to a pager...
4221                                          * if so, take the slow path by
4222                                          * doing a 'break' from the while (TRUE) loop
4223                                          *
4224                                          * external_state will only be set to VM_EXTERNAL_STATE_EXISTS
4225                                          * if the compressor is active and the page exists there
4226                                          */
4227                                         if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS)
4228                                                 break;
4229
4230                                         if (map == kernel_map || real_map == kernel_map) {
4231                                                 /*
4232                                                  * can't call into the compressor with the kernel_map
4233                                                  * lock held, since the compressor may try to operate
4234                                                  * on the kernel map in order to return an empty c_segment
4235                                                  */
4236                                                 break;
4237                                         }
4238                                         if (object != cur_object) {
4239                                                 if (fault_type & VM_PROT_WRITE)
4240                                                         c_flags |= C_KEEP;
4241                                                 else
4242                                                         insert_cur_object = TRUE;
4243                                         }
4244                                         if (insert_cur_object == TRUE) {
4245
4246                                                 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4247
4248                                                         cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4249
4250                                                         if (vm_object_lock_upgrade(cur_object) == FALSE) {
4251                                                                 /*
4252                                                                  * couldn't upgrade so go do a full retry
4253                                                                  * immediately since we can no longer be
4254                                                                  * certain about cur_object (since we
4255                                                                  * don't hold a reference on it)...
4256                                                                  * first drop the top object lock
4257                                                                  */
4258                                                                 vm_object_unlock(object);
4259
4260                                                                 vm_map_unlock_read(map);
4261                                                                 if (real_map != map)
4262                                                                         vm_map_unlock(real_map);
4263
4264                                                                 goto RetryFault;
4265                                                         }
4266                                                 }
4267                                         } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4268
4269                                                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4270
4271                                                 if (object != cur_object) {
4272                                                         /*
4273                                                          * we can't go for the upgrade on the top
4274                                                          * lock since the upgrade may block waiting
4275                                                          * for readers to drain... since we hold
4276                                                          * cur_object locked at this point, waiting
4277                                                          * for the readers to drain would represent
4278                                                          * a lock order inversion since the lock order
4279                                                          * for objects is the reference order in the
4280                                                          * shadow chain
4281                                                          */
4282                                                         vm_object_unlock(object);
4283                                                         vm_object_unlock(cur_object);
4284
4285                                                         vm_map_unlock_read(map);
4286                                                         if (real_map != map)
4287                                                                 vm_map_unlock(real_map);
4288
4289                                                         goto RetryFault;
4290                                                 }
4291                                                 if (vm_object_lock_upgrade(object) == FALSE) {
4292                                                         /*
4293                                                          * couldn't upgrade, so explicitly take the lock
4294                                                          * exclusively and go relookup the page since we
4295                                                          * will have dropped the object lock and
4296                                                          * a different thread could have inserted
4297                                                          * a page at this offset
4298                                                          * no need for a full retry since we're
4299                                                          * at the top level of the object chain
4300                                                          */
4301                                                         vm_object_lock(object);
4302
4303                                                         continue;
4304                                                 }
4305                                         }
4306                                         m = vm_page_grab_options(grab_options);
4307                                         m_object = NULL;
4308
4309                                         if (m == VM_PAGE_NULL) {
4310                                                 /*
4311                                                  * no free page currently available...
4312                                                  * must take the slow path
4313                                                  */
4314                                                 break;
4315                                         }
4316
4317                                         /*
4318                                          * The object is and remains locked
4319                                          * so no need to take a
4320                                          * "paging_in_progress" reference.
4321                                          */
4322                                         boolean_t shared_lock;
4323                                         if ((object == cur_object &&
4324                                              object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
4325                                             (object != cur_object &&
4326                                              cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
4327                                                 shared_lock = FALSE;
4328                                         } else {
4329                                                 shared_lock = TRUE;
4330                                         }
4331
4332                                         kr = vm_compressor_pager_get(
4333                                                 cur_object->pager,
4334                                                 (cur_offset +
4335                                                  cur_object->paging_offset),
4336                                                 VM_PAGE_GET_PHYS_PAGE(m),
4337                                                 &my_fault_type,
4338                                                 c_flags,
4339                                                 &compressed_count_delta);
4340
4341                                         vm_compressor_pager_count(
4342                                                 cur_object->pager,
4343                                                 compressed_count_delta,
4344                                                 shared_lock,
4345                                                 cur_object);
4346
4347                                         if (kr != KERN_SUCCESS) {
4348                                                 vm_page_release(m, FALSE);
4349                                                 m = VM_PAGE_NULL;
4350                                                 break;
4351                                         }
4352                                         m->dirty = TRUE;
4353
4354                                         /*
4355                                          * If the object is purgeable, its
4356                                          * owner's purgeable ledgers will be
4357                                          * updated in vm_page_insert() but the
4358                                          * page was also accounted for in a
4359                                          * "compressed purgeable" ledger, so
4360                                          * update that now.
4361                                          */
4362                                         if (object != cur_object &&
4363                                             !insert_cur_object) {
4364                                                 /*
4365                                                  * We're not going to insert
4366                                                  * the decompressed page into
4367                                                  * the object it came from.
4368                                                  *
4369                                                  * We're dealing with a
4370                                                  * copy-on-write fault on
4371                                                  * "object".
4372                                                  * We're going to decompress
4373                                                  * the page directly into the
4374                                                  * target "object" while
4375                                                  * keeping the compressed
4376                                                  * page for "cur_object", so
4377                                                  * no ledger update in that
4378                                                  * case.
4379                                                  */
4380                                         } else if ((cur_object->purgable ==
4381                                                     VM_PURGABLE_DENY) ||
4382                                                    (cur_object->vo_purgeable_owner ==
4383                                                     NULL)) {
4384                                                 /*
4385                                                  * "cur_object" is not purgeable
4386                                                  * or is not owned, so no
4387                                                  * purgeable ledgers to update.
4388                                                  */
4389                                         } else {
4390                                                 /*
4391                                                  * One less compressed
4392                                                  * purgeable page for
4393                                                  * cur_object's owner.
4394                                                  */
4395                                                 vm_purgeable_compressed_update(
4396                                                         cur_object,
4397                                                         -1);
4398                                         }
4399
4400                                         if (insert_cur_object) {
4401                                                 vm_page_insert(m, cur_object, cur_offset);
4402                                                 m_object = cur_object;
4403                                         } else {
4404                                                 vm_page_insert(m, object, offset);
4405                                                 m_object = object;
4406                                         }
4407
4408                                         if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
4409                                                 /*
4410                                                  * If the page is not cacheable,
4411                                                  * we can't let its contents
4412                                                  * linger in the data cache
4413                                                  * after the decompression.
4414                                                  */
4415                                                 pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m));
4416                                         }
4417
4418                                         type_of_fault = my_fault_type;
4419
4420                                         VM_STAT_INCR(decompressions);
4421
4422                                         if (cur_object != object) {
4423                                                 if (insert_cur_object) {
4424                                                         top_object = object;
4425                                                         /*
4426                                                          * switch to the object that has the new page
4427                                                          */
4428                                                         object = cur_object;
4429                                                         object_lock_type = cur_object_lock_type;
4430                                                 } else {
4431                                                         vm_object_unlock(cur_object);
4432                                                         cur_object = object;
4433                                                 }
4434                                         }
4435                                         goto FastPmapEnter;
4436                                 }
4437                                 /*
4438                                  * existence map present and indicates
4439                                  * that the pager doesn't have this page
4440                                  */
4441                         }
4442                         if (cur_object->shadow == VM_OBJECT_NULL) {
4443                                 /*
4444                                  * Zero fill fault.  Page gets
4445                                  * inserted into the original object.
4446                                  */
4447                                 if (cur_object->shadow_severed ||
4448                                     VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) ||
4449                                     cur_object == compressor_object ||
4450                                     cur_object == kernel_object ||
4451                                     cur_object == vm_submap_object) {
4452                                         if (object != cur_object)
4453                                                 vm_object_unlock(cur_object);
4454                                         vm_object_unlock(object);
4455
4456                                         vm_map_unlock_read(map);
4457                                         if (real_map != map)
4458                                                 vm_map_unlock(real_map);
4459
4460                                         kr = KERN_MEMORY_ERROR;
4461                                         goto done;
4462                                 }
4463                                 if (vm_backing_store_low) {
4464                                         /*
4465                                          * we are protecting the system from
4466                                          * backing store exhaustion...
4467                                          * must take the slow path if we're
4468                                          * not privileged
4469                                          */
4470                                         if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
4471                                                 break;
4472                                 }
4473                                 if (cur_object != object) {
4474                                         vm_object_unlock(cur_object);
4475
4476                                         cur_object = object;
4477                                 }
4478                                 if (object_lock_type == OBJECT_LOCK_SHARED) {
4479
4480                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4481
4482                                         if (vm_object_lock_upgrade(object) == FALSE) {
4483                                                 /*
4484                                                  * couldn't upgrade so do a full retry on the fault
4485                                                  * since we dropped the object lock which
4486                                                  * could allow another thread to insert
4487                                                  * a page at this offset
4488                                                  */
4489                                                 vm_map_unlock_read(map);
4490                                                 if (real_map != map)
4491                                                         vm_map_unlock(real_map);
4492
4493                                                 goto RetryFault;
4494                                         }
4495                                 }
4496                                 m = vm_page_alloc(object, offset);
4497                                 m_object = NULL;
4498
4499                                 if (m == VM_PAGE_NULL) {
4500                                         /*
4501                                          * no free page currently available...
4502                                          * must take the slow path
4503                                          */
4504                                         break;
4505                                 }
4506                                 m_object = object;
4507
4508                                 /*
4509                                  * Now zero fill page...
4510                                  * the page is probably going to
4511                                  * be written soon, so don't bother
4512                                  * to clear the modified bit
4513                                  *
4514                                  *   NOTE: This code holds the map
4515                                  *   lock across the zero fill.
4516                                  */
4517                                 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
4518
4519                                 goto FastPmapEnter;
4520                         }
4521                         /*
4522                          * On to the next level in the shadow chain
4523                          */
4524                         cur_offset += cur_object->vo_shadow_offset;
4525                         new_object = cur_object->shadow;
4526
4527                         /*
4528                          * take the new_object's lock with the indicated state
4529                          */
4530                         if (cur_object_lock_type == OBJECT_LOCK_SHARED)
4531                                 vm_object_lock_shared(new_object);
4532                         else
4533                                 vm_object_lock(new_object);
4534
4535                         if (cur_object != object)
4536                                 vm_object_unlock(cur_object);
4537
4538                         cur_object = new_object;
4539
4540                         continue;
4541                 }
4542         }
4543         /*
4544          * Cleanup from fast fault failure.  Drop any object
4545          * lock other than original and drop map lock.
4546          */
4547         if (object != cur_object)
4548                 vm_object_unlock(cur_object);
4549
4550         /*
4551          * must own the object lock exclusively at this point
4552          */
4553         if (object_lock_type == OBJECT_LOCK_SHARED) {
4554                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4555
4556                 if (vm_object_lock_upgrade(object) == FALSE) {
4557                         /*
4558                          * couldn't upgrade, so explicitly
4559                          * take the lock exclusively
4560                          * no need to retry the fault at this
4561                          * point since "vm_fault_page" will
4562                          * completely re-evaluate the state
4563                          */
4564                         vm_object_lock(object);
4565                 }
4566         }
4567
4568 handle_copy_delay:
4569         vm_map_unlock_read(map);
4570         if (real_map != map)
4571                 vm_map_unlock(real_map);
4572
4573         if (__improbable(object == compressor_object ||
4574                 object == kernel_object ||
4575                 object == vm_submap_object)) {
4576                 /*
4577                  * These objects are explicitly managed and populated by the
4578                  * kernel.  The virtual ranges backed by these objects should
4579                  * either have wired pages or "holes" that are not supposed to
4580                  * be accessed at all until they get explicitly populated.
4581                  * We should never have to resolve a fault on a mapping backed
4582                  * by one of these VM objects and providing a zero-filled page
4583                  * would be wrong here, so let's fail the fault and let the
4584                  * caller crash or recover.
4585                  */
4586                 vm_object_unlock(object);
4587                 kr = KERN_MEMORY_ERROR;
4588                 goto done;
4589         }
4590
4591         assert(object != compressor_object);
4592         assert(object != kernel_object);
4593         assert(object != vm_submap_object);
4594
4595         /*
4596          * Make a reference to this object to
4597          * prevent its disposal while we are messing with
4598          * it.  Once we have the reference, the map is free
4599          * to be diddled.  Since objects reference their
4600          * shadows (and copies), they will stay around as well.
4601          */
4602         vm_object_reference_locked(object);
4603         vm_object_paging_begin(object);
4604
4605         XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
4606
4607         error_code = 0;
4608
4609         result_page = VM_PAGE_NULL;
4610         kr = vm_fault_page(object, offset, fault_type,
4611                            (change_wiring && !wired),
4612                            FALSE, /* page not looked up */
4613                            &prot, &result_page, &top_page,
4614                            &type_of_fault,
4615                            &error_code, map->no_zero_fill,
4616                            FALSE, &fault_info);
4617
4618         /*
4619          * if kr != VM_FAULT_SUCCESS, then the paging reference
4620          * has been dropped and the object unlocked... the ref_count
4621          * is still held
4622          *
4623          * if kr == VM_FAULT_SUCCESS, then the paging reference
4624          * is still held along with the ref_count on the original object
4625          *
4626          *      the object is returned locked with a paging reference
4627          *
4628          *      if top_page != NULL, then it's BUSY and the
4629          *      object it belongs to has a paging reference
4630          *      but is returned unlocked
4631          */
4632         if (kr != VM_FAULT_SUCCESS &&
4633             kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
4634                 /*
4635                  * we didn't succeed, lose the object reference immediately.
4636                  */
4637                 vm_object_deallocate(object);
4638
4639                 /*
4640                  * See why we failed, and take corrective action.
4641                  */
4642                 switch (kr) {
4643                 case VM_FAULT_MEMORY_SHORTAGE:
4644                         if (vm_page_wait((change_wiring) ?
4645                                          THREAD_UNINT :
4646                                          THREAD_ABORTSAFE))
4647                                 goto RetryFault;
4648                         /*
4649                          * fall thru
4650                          */
4651                 case VM_FAULT_INTERRUPTED:
4652                         kr = KERN_ABORTED;
4653                         goto done;
4654                 case VM_FAULT_RETRY:
4655                         goto RetryFault;
4656                 case VM_FAULT_MEMORY_ERROR:
4657                         if (error_code)
4658                                 kr = error_code;
4659                         else
4660                                 kr = KERN_MEMORY_ERROR;
4661                         goto done;
4662                 default:
4663                         panic("vm_fault: unexpected error 0x%x from "
4664                               "vm_fault_page()\n", kr);
4665                 }
4666         }
4667         m = result_page;
4668         m_object = NULL;
4669
4670         if (m != VM_PAGE_NULL) {
4671                 m_object = VM_PAGE_OBJECT(m);
4672                 assert((change_wiring && !wired) ?
4673                        (top_page == VM_PAGE_NULL) :
4674                        ((top_page == VM_PAGE_NULL) == (m_object == object)));
4675         }
4676
4677         /*
4678          * What to do with the resulting page from vm_fault_page
4679          * if it doesn't get entered into the physical map:
4680          */
4681 #define RELEASE_PAGE(m)                                 \
4682         MACRO_BEGIN                                     \
4683         PAGE_WAKEUP_DONE(m);                            \
4684         if ( !VM_PAGE_PAGEABLE(m)) {                    \
4685                 vm_page_lockspin_queues();              \
4686                 if ( !VM_PAGE_PAGEABLE(m))              \
4687                         vm_page_activate(m);            \
4688                 vm_page_unlock_queues();                \
4689         }                                               \
4690         MACRO_END
4691
4692
4693         object_locks_dropped = FALSE;
4694         /*
4695          * We must verify that the maps have not changed
4696          * since our last lookup. vm_map_verify() needs the
4697          * map lock (shared) but we are holding object locks.
4698          * So we do a try_lock() first and, if that fails, we
4699          * drop the object locks and go in for the map lock again.
4700          */
4701         if (!vm_map_try_lock_read(original_map)) {
4702
4703                 if (m != VM_PAGE_NULL) {
4704                         old_copy_object = m_object->copy;
4705                         vm_object_unlock(m_object);
4706                 } else {
4707                         old_copy_object = VM_OBJECT_NULL;
4708                         vm_object_unlock(object);
4709                 }
4710
4711                 object_locks_dropped = TRUE;
4712
4713                 vm_map_lock_read(original_map);
4714         }
4715
4716         if ((map != original_map) || !vm_map_verify(map, &version)) {
4717
4718                 if (object_locks_dropped == FALSE) {
4719                         if (m != VM_PAGE_NULL) {
4720                                 old_copy_object = m_object->copy;
4721                                 vm_object_unlock(m_object);
4722                         } else {
4723                                 old_copy_object = VM_OBJECT_NULL;
4724                                 vm_object_unlock(object);
4725                         }
4726
4727                         object_locks_dropped = TRUE;
4728                 }
4729
4730                 /*
4731                  * no object locks are held at this point
4732                  */
4733                 vm_object_t             retry_object;
4734                 vm_object_offset_t      retry_offset;
4735                 vm_prot_t               retry_prot;
4736
4737                 /*
4738                  * To avoid trying to write_lock the map while another
4739                  * thread has it read_locked (in vm_map_pageable), we
4740                  * do not try for write permission.  If the page is
4741                  * still writable, we will get write permission.  If it
4742                  * is not, or has been marked needs_copy, we enter the
4743                  * mapping without write permission, and will merely
4744                  * take another fault.
4745                  */
4746                 map = original_map;
4747
4748                 kr = vm_map_lookup_locked(&map, vaddr,
4749                                           fault_type & ~VM_PROT_WRITE,
4750                                           OBJECT_LOCK_EXCLUSIVE, &version,
4751                                           &retry_object, &retry_offset, &retry_prot,
4752                                           &wired,
4753                                           &fault_info,
4754                                           &real_map);
4755                 pmap = real_map->pmap;
4756
4757                 if (kr != KERN_SUCCESS) {
4758                         vm_map_unlock_read(map);
4759
4760                         if (m != VM_PAGE_NULL) {
4761                                 assert(VM_PAGE_OBJECT(m) == m_object);
4762
4763                                 /*
4764                                  * retake the lock so that
4765                                  * we can drop the paging reference
4766                                  * in vm_fault_cleanup and do the
4767                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
4768                                  */
4769                                 vm_object_lock(m_object);
4770
4771                                 RELEASE_PAGE(m);
4772
4773                                 vm_fault_cleanup(m_object, top_page);
4774                         } else {
4775                                 /*
4776                                  * retake the lock so that
4777                                  * we can drop the paging reference
4778                                  * in vm_fault_cleanup
4779                                  */
4780                                 vm_object_lock(object);
4781
4782                                 vm_fault_cleanup(object, top_page);
4783                         }
4784                         vm_object_deallocate(object);
4785
4786                         goto done;
4787                 }
4788                 vm_object_unlock(retry_object);
4789
4790                 if ((retry_object != object) || (retry_offset != offset)) {
4791
4792                         vm_map_unlock_read(map);
4793                         if (real_map != map)
4794                                 vm_map_unlock(real_map);
4795
4796                         if (m != VM_PAGE_NULL) {
4797                                 assert(VM_PAGE_OBJECT(m) == m_object);
4798
4799                                 /*
4800                                  * retake the lock so that
4801                                  * we can drop the paging reference
4802                                  * in vm_fault_cleanup and do the
4803                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
4804                                  */
4805                                 vm_object_lock(m_object);
4806
4807                                 RELEASE_PAGE(m);
4808
4809                                 vm_fault_cleanup(m_object, top_page);
4810                         } else {
4811                                 /*
4812                                  * retake the lock so that
4813                                  * we can drop the paging reference
4814                                  * in vm_fault_cleanup
4815                                  */
4816                                 vm_object_lock(object);
4817
4818                                 vm_fault_cleanup(object, top_page);
4819                         }
4820                         vm_object_deallocate(object);
4821
4822                         goto RetryFault;
4823                 }
4824                 /*
4825                  * Check whether the protection has changed or the object
4826                  * has been copied while we left the map unlocked.
4827                  */
4828                 if (pmap_has_prot_policy(retry_prot)) {
4829                         /* If the pmap layer cares, pass the full set. */
4830                         prot = retry_prot;
4831                 } else {
4832                         prot &= retry_prot;
4833                 }
4834         }
4835
4836         if (object_locks_dropped == TRUE) {
4837                 if (m != VM_PAGE_NULL) {
4838                         vm_object_lock(m_object);
4839
4840                         if (m_object->copy != old_copy_object) {
4841                                 /*
4842                                  * The copy object changed while the top-level object
4843                                  * was unlocked, so take away write permission.
4844                                  */
4845                                 assert(!pmap_has_prot_policy(prot));
4846                                 prot &= ~VM_PROT_WRITE;
4847                         }
4848                 } else
4849                         vm_object_lock(object);
4850
4851                 object_locks_dropped = FALSE;
4852         }
4853
4854         /*
4855          * If we want to wire down this page, but no longer have
4856          * adequate permissions, we must start all over.
4857          */
4858         if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
4859
4860                 vm_map_unlock_read(map);
4861                 if (real_map != map)
4862                         vm_map_unlock(real_map);
4863
4864                 if (m != VM_PAGE_NULL) {
4865                         assert(VM_PAGE_OBJECT(m) == m_object);
4866
4867                         RELEASE_PAGE(m);
4868
4869                         vm_fault_cleanup(m_object, top_page);
4870                 } else
4871                         vm_fault_cleanup(object, top_page);
4872
4873                 vm_object_deallocate(object);
4874
4875                 goto RetryFault;
4876         }
4877         if (m != VM_PAGE_NULL) {
4878                 /*
4879                  * Put this page into the physical map.
4880                  * We had to do the unlock above because pmap_enter
4881                  * may cause other faults.  The page may be on
4882                  * the pageout queues.  If the pageout daemon comes
4883                  * across the page, it will remove it from the queues.
4884                  */
4885                 if (caller_pmap) {
4886                         kr = vm_fault_enter(m,
4887                                             caller_pmap,
4888                                             caller_pmap_addr,
4889                                             prot,
4890                                             caller_prot,
4891                                             wired,
4892                                             change_wiring,
4893                                             wire_tag,
4894                                             fault_info.no_cache,
4895                                             fault_info.cs_bypass,
4896                                             fault_info.user_tag,
4897                                             fault_info.pmap_options,
4898                                             NULL,
4899                                             &type_of_fault);
4900                 } else {
4901                         kr = vm_fault_enter(m,
4902                                             pmap,
4903                                             vaddr,
4904                                             prot,
4905                                             caller_prot,
4906                                             wired,
4907                                             change_wiring,
4908                                             wire_tag,
4909                                             fault_info.no_cache,
4910                                             fault_info.cs_bypass,
4911                                             fault_info.user_tag,
4912                                             fault_info.pmap_options,
4913                                             NULL,
4914                                             &type_of_fault);
4915                 }
4916                 assert(VM_PAGE_OBJECT(m) == m_object);
4917
4918 #if DEVELOPMENT || DEBUG
4919         {
4920                 int     event_code = 0;
4921
4922                 if (m_object->internal)
4923                         event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
4924                 else if (m_object->object_slid)
4925                         event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
4926                 else
4927                         event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
4928
4929                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->offset, get_current_unique_pid(), 0);
4930
4931                 DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
4932                 }
4933 #endif
4934                 if (kr != KERN_SUCCESS) {
4935                         /* abort this page fault */
4936                         vm_map_unlock_read(map);
4937                         if (real_map != map)
4938                                 vm_map_unlock(real_map);
4939                         PAGE_WAKEUP_DONE(m);
4940                         vm_fault_cleanup(m_object, top_page);
4941                         vm_object_deallocate(object);
4942                         goto done;
4943                 }
4944                 if (physpage_p != NULL) {
4945                         /* for vm_map_wire_and_extract() */
4946                         *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
4947                         if (prot & VM_PROT_WRITE) {
4948                                 vm_object_lock_assert_exclusive(m_object);
4949                                 m->dirty = TRUE;
4950                         }
4951                 }
4952         } else {
4953
4954                 vm_map_entry_t          entry;
4955                 vm_map_offset_t         laddr;
4956                 vm_map_offset_t         ldelta, hdelta;
4957
4958                 /*
4959                  * do a pmap block mapping from the physical address
4960                  * in the object
4961                  */
4962
4963 #ifdef ppc
4964                 /* While we do not worry about execution protection in   */
4965                 /* general, certain pages may have instruction execution */
4966                 /* disallowed.  We will check here, and if not allowed   */
4967                 /* to execute, we return with a protection failure.      */
4968
4969                 if ((fault_type & VM_PROT_EXECUTE) &&
4970                         (!pmap_eligible_for_execute((ppnum_t)(object->vo_shadow_offset >> 12)))) {
4971
4972                         vm_map_unlock_read(map);
4973
4974                         if (real_map != map)
4975                                 vm_map_unlock(real_map);
4976
4977                         vm_fault_cleanup(object, top_page);
4978                         vm_object_deallocate(object);
4979
4980                         kr = KERN_PROTECTION_FAILURE;
4981                         goto done;
4982                 }
4983 #endif  /* ppc */
4984
4985                 if (real_map != map)
4986                         vm_map_unlock(real_map);
4987
4988                 if (original_map != map) {
4989                         vm_map_unlock_read(map);
4990                         vm_map_lock_read(original_map);
4991                         map = original_map;
4992                 }
4993                 real_map = map;
4994
4995                 laddr = vaddr;
4996                 hdelta = 0xFFFFF000;
4997                 ldelta = 0xFFFFF000;
4998
4999                 while (vm_map_lookup_entry(map, laddr, &entry)) {
5000                         if (ldelta > (laddr - entry->vme_start))
5001                                 ldelta = laddr - entry->vme_start;
5002                         if (hdelta > (entry->vme_end - laddr))
5003                                 hdelta = entry->vme_end - laddr;
5004                         if (entry->is_sub_map) {
5005
5006                                 laddr = ((laddr - entry->vme_start)
5007                                          + VME_OFFSET(entry));
5008                                 vm_map_lock_read(VME_SUBMAP(entry));
5009
5010                                 if (map != real_map)
5011                                         vm_map_unlock_read(map);
5012                                 if (entry->use_pmap) {
5013                                         vm_map_unlock_read(real_map);
5014                                         real_map = VME_SUBMAP(entry);
5015                                 }
5016                                 map = VME_SUBMAP(entry);
5017
5018                         } else {
5019                                 break;
5020                         }
5021                 }
5022
5023                 if (vm_map_lookup_entry(map, laddr, &entry) &&
5024                     (VME_OBJECT(entry) != NULL) &&
5025                     (VME_OBJECT(entry) == object)) {
5026                         int superpage;
5027
5028                         if (!object->pager_created &&
5029                             object->phys_contiguous &&
5030                             VME_OFFSET(entry) == 0 &&
5031                             (entry->vme_end - entry->vme_start == object->vo_size) &&
5032                             VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size-1))) {
5033                                 superpage = VM_MEM_SUPERPAGE;
5034                         } else {
5035                                 superpage = 0;
5036                         }
5037
5038                         if (superpage && physpage_p) {
5039                                 /* for vm_map_wire_and_extract() */
5040                                 *physpage_p = (ppnum_t)
5041                                         ((((vm_map_offset_t)
5042                                            object->vo_shadow_offset)
5043                                           + VME_OFFSET(entry)
5044                                           + (laddr - entry->vme_start))
5045                                          >> PAGE_SHIFT);
5046                         }
5047
5048                         if (caller_pmap) {
5049                                 /*
5050                                  * Set up a block mapped area
5051                                  */
5052                                 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
5053                                 kr = pmap_map_block(caller_pmap,
5054                                                     (addr64_t)(caller_pmap_addr - ldelta),
5055                                                     (ppnum_t)((((vm_map_offset_t) (VME_OBJECT(entry)->vo_shadow_offset)) +
5056                                                                VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
5057                                                     (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
5058                                                     (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
5059
5060                                 if (kr != KERN_SUCCESS) {
5061                                         goto cleanup;
5062                                 }
5063                         } else {
5064                                 /*
5065                                  * Set up a block mapped area
5066                                  */
5067                                 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
5068                                 kr = pmap_map_block(real_map->pmap,
5069                                                     (addr64_t)(vaddr - ldelta),
5070                                                     (ppnum_t)((((vm_map_offset_t)(VME_OBJECT(entry)->vo_shadow_offset)) +
5071                                                                VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
5072                                                     (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
5073                                                     (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
5074
5075                                 if (kr != KERN_SUCCESS) {
5076                                         goto cleanup;
5077                                 }
5078                         }
5079                 }
5080         }
5081
5082         /*
5083          * Success
5084          */
5085         kr = KERN_SUCCESS;
5086
5087         /*
5088          * TODO: could most of the done cases just use cleanup?
5089          */
5090 cleanup:
5091         /*
5092          * Unlock everything, and return
5093          */
5094         vm_map_unlock_read(map);
5095         if (real_map != map)
5096                 vm_map_unlock(real_map);
5097
5098         if (m != VM_PAGE_NULL) {
5099                 assert(VM_PAGE_OBJECT(m) == m_object);
5100
5101                 PAGE_WAKEUP_DONE(m);
5102
5103                 vm_fault_cleanup(m_object, top_page);
5104         } else
5105                 vm_fault_cleanup(object, top_page);
5106
5107         vm_object_deallocate(object);
5108
5109 #undef  RELEASE_PAGE
5110
5111 done:
5112         thread_interrupt_level(interruptible_state);
5113
5114         /*
5115          * Only I/O throttle on faults which cause a pagein/swapin.
5116          */
5117         if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
5118                 throttle_lowpri_io(1);
5119         } else {
5120                 if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
5121
5122                         if ((throttle_delay = vm_page_throttled(TRUE))) {
5123
5124                                 if (vm_debug_events) {
5125                                         if (type_of_fault == DBG_COMPRESSOR_FAULT)
5126                                                 VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5127                                         else if (type_of_fault == DBG_COW_FAULT)
5128                                                 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5129                                         else
5130                                                 VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5131                                 }
5132                                 delay(throttle_delay);
5133                         }
5134                 }
5135         }
5136         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
5137                               (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
5138                               ((uint64_t)trace_vaddr >> 32),
5139                               trace_vaddr,
5140                               kr,
5141                               type_of_fault,
5142                               0);
5143
5144         return (kr);
5145 }
5146
5147 /*
5148  *      vm_fault_wire:
5149  *
5150  *      Wire down a range of virtual addresses in a map.
      *
      *      Every page in [entry->vme_start, entry->vme_end) is handled
      *      one PAGE_SIZE step at a time.  vm_fault_wire_fast() is tried
      *      first for each page; if it does not return KERN_SUCCESS,
      *      vm_fault_internal() is called for the same address.
      *
      *      Parameters:
      *              map             map containing the entry; handed to the
      *                              fault routines and to vm_fault_unwire().
      *              entry           map entry to wire; asserted to be
      *                              in_transition.
      *              prot            protection handed to the fault routines.
      *              wire_tag        tag handed to the fault routines.
      *              pmap            physical map the pages are entered into.
      *              pmap_addr       address in "pmap" that corresponds to
      *                              entry->vme_start; each page uses
      *                              pmap_addr + (va - entry->vme_start).
      *              physpage_p      passed unchanged to the fault routines
      *                              for every page of the range.
      *
      *      Returns:
      *              KERN_SUCCESS    every page was handled successfully, or
      *                              the entry directly maps a phys_contiguous
      *                              object (nothing is done in that case).
      *              otherwise       the result of vm_fault_internal() for the
      *                              first page that failed; the pages before
      *                              it are passed to vm_fault_unwire() first.
5151  */
5152 kern_return_t
5153 vm_fault_wire(
5154         vm_map_t        map,
5155         vm_map_entry_t  entry,
5156         vm_prot_t       prot,
5157         vm_tag_t        wire_tag,
5158         pmap_t          pmap,
5159         vm_map_offset_t pmap_addr,
5160         ppnum_t         *physpage_p)
5161 {
5162         vm_map_offset_t va;
5163         vm_map_offset_t end_addr = entry->vme_end;
5164         kern_return_t   rc;
5165
5166         assert(entry->in_transition);
5167
             /*
              *      Nothing to do for a phys_contiguous object mapped
              *      directly by this entry.  vm_fault_unwire() has the
              *      matching early return and describes such memory as
              *      wired by default.
              */
5168         if ((VME_OBJECT(entry) != NULL) &&
5169             !entry->is_sub_map &&
5170             VME_OBJECT(entry)->phys_contiguous) {
5171                 return KERN_SUCCESS;
5172         }
5173
5174         /*
5175          *      Inform the physical mapping system that the
5176          *      range of addresses may not fault, so that
5177          *      page tables and such can be locked down as well.
5178          */
5179
5180         pmap_pageable(pmap, pmap_addr,
5181                 pmap_addr + (end_addr - entry->vme_start), FALSE);
5182
5183         /*
5184          *      We simulate a fault to get the page and enter it
5185          *      in the physical map.
5186          */
5187
5188         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
5189                 rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap,
5190                                         pmap_addr + (va - entry->vme_start),
5191                                         physpage_p);
5192                 if (rc != KERN_SUCCESS) {
                             /*
                              * The fast path declined this page: take the
                              * full fault path.  Faults against kernel_pmap
                              * use THREAD_UNINT, all others THREAD_ABORTSAFE.
                              */
5193                         rc = vm_fault_internal(map, va, prot, TRUE, wire_tag,
5194                                                ((pmap == kernel_pmap)
5195                                                 ? THREAD_UNINT
5196                                                 : THREAD_ABORTSAFE),
5197                                                pmap,
5198                                                (pmap_addr +
5199                                                 (va - entry->vme_start)),
5200                                                physpage_p);
                             /* probe fires only when the slow path was taken */
5201                         DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
5202                 }
5203
5204                 if (rc != KERN_SUCCESS) {
                             /*
                              * Work on a stack copy of the entry, truncated
                              * to end at the failing address, so that only
                              * the pages handled by earlier iterations are
                              * unwired and the caller's entry is not modified
                              * by this function.
                              */
5205                         struct vm_map_entry     tmp_entry = *entry;
5206
5207                         /* unwire wired pages */
5208                         tmp_entry.vme_end = va;
5209                         vm_fault_unwire(map,
5210                                 &tmp_entry, FALSE, pmap, pmap_addr);
5211
5212                         return rc;
5213                 }
5214         }
5215         return KERN_SUCCESS;
5216 }
5217
5218 /*
5219  *      vm_fault_unwire:
5220  *
5221  *      Unwire a range of virtual addresses in a map.
      *
      *      Walks [entry->vme_start, entry->vme_end) one PAGE_SIZE step
      *      at a time and undoes the wiring of each page.
      *
      *      Parameters:
      *              map             map containing the entry.
      *              entry           map entry whose range is unwired.
      *              deallocate      TRUE: each page found is disconnected
      *                              with pmap_disconnect() and freed with
      *                              VM_PAGE_FREE().  FALSE: the pmap wiring
      *                              is cleared and a wired page is unwired
      *                              with vm_page_unwire(), but kept.
      *              pmap            physical map holding the mappings; the
      *                              per-page pmap_change_wiring() calls are
      *                              skipped when it is NULL.
      *              pmap_addr       address in "pmap" that corresponds to
      *                              entry->vme_start.
      *
      *      Returns nothing.  Panics ("vm_fault_unwire: failure") if
      *      vm_fault_page() returns anything other than VM_FAULT_SUCCESS,
      *      except for the two VM_FAULT_MEMORY_ERROR cases tolerated in
      *      the loop below.
5222  */
5223 void
5224 vm_fault_unwire(
5225         vm_map_t        map,
5226         vm_map_entry_t  entry,
5227         boolean_t       deallocate,
5228         pmap_t          pmap,
5229         vm_map_offset_t pmap_addr)
5230 {
5231         vm_map_offset_t va;
5232         vm_map_offset_t end_addr = entry->vme_end;
5233         vm_object_t             object;
5234         struct vm_object_fault_info fault_info;
5235         unsigned int    unwired_pages;
5236
             /* sub-map entries are treated as having no object here */
5237         object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);
5238
5239         /*
5240          * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
5241          * do anything since such memory is wired by default.  So we don't have
5242          * anything to undo here.
5243          */
5244
5245         if (object != VM_OBJECT_NULL && object->phys_contiguous)
5246                 return;
5247
             /*
              * Fault parameters for the vm_fault_page() calls below;
              * cluster_size is filled in per page inside the loop.
              */
5248         fault_info.interruptible = THREAD_UNINT;
5249         fault_info.behavior = entry->behavior;
5250         fault_info.user_tag = VME_ALIAS(entry);
5251         fault_info.pmap_options = 0;
5252         if (entry->iokit_acct ||
5253             (!entry->is_sub_map && !entry->use_pmap)) {
5254                 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
5255         }
5256         fault_info.lo_offset = VME_OFFSET(entry);
5257         fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
5258         fault_info.no_cache = entry->no_cache;
5259         fault_info.stealth = TRUE;
5260         fault_info.io_sync = FALSE;
5261         fault_info.cs_bypass = FALSE;
5262         fault_info.mark_zf_absent = FALSE;
5263         fault_info.batch_pmap_op = FALSE;
5264
             /* counts pages found wired; used for the tag update at the end */
5265         unwired_pages = 0;
5266
5267         /*
5268          *      Since the pages are wired down, we must be able to
5269          *      get their mappings from the physical map system.
5270          */
5271
5272         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
5273
5274                 if (object == VM_OBJECT_NULL) {
                             /*
                              * No object to look the page up in: clear the
                              * pmap wiring and call vm_fault() for this
                              * address with VM_PROT_NONE.
                              */
5275                         if (pmap) {
5276                                 pmap_change_wiring(pmap,
5277                                                    pmap_addr + (va - entry->vme_start), FALSE);
5278                         }
                             /*
                              * NOTE(review): pmap_change_wiring() above uses
                              * pmap_addr + (va - entry->vme_start), but
                              * vm_fault() is handed the unadjusted pmap_addr
                              * for every page -- confirm this is intended.
                              */
5279                         (void) vm_fault(map, va, VM_PROT_NONE,
5280                                         TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr);
5281                 } else {
5282                         vm_prot_t       prot;
5283                         vm_page_t       result_page;
5284                         vm_page_t       top_page;
5285                         vm_object_t     result_object;
5286                         vm_fault_return_t result;
5287
                             /*
                              * cluster_size is the number of bytes left in the
                              * range, clamped when that does not fit in a
                              * vm_size_t.
                              */
5288                         if (end_addr - va > (vm_size_t) -1) {
5289                                 /* 32-bit overflow */
5290                                 fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
5291                         } else {
5292                                 fault_info.cluster_size = (vm_size_t) (end_addr - va);
5293                                 assert(fault_info.cluster_size == end_addr - va);
5294                         }
5295
                             /*
                              * The object lock and vm_object_paging_begin()
                              * are re-taken on every attempt; presumably
                              * vm_fault_page() has released both when it
                              * returns VM_FAULT_RETRY -- TODO confirm.
                              */
5296                         do {
5297                                 prot = VM_PROT_NONE;
5298
5299                                 vm_object_lock(object);
5300                                 vm_object_paging_begin(object);
5301                                 XPR(XPR_VM_FAULT,
5302                                         "vm_fault_unwire -> vm_fault_page\n",
5303                                         0,0,0,0,0);
5304                                 result_page = VM_PAGE_NULL;
5305                                 result = vm_fault_page(
5306                                         object,
5307                                         (VME_OFFSET(entry) +
5308                                          (va - entry->vme_start)),
5309                                         VM_PROT_NONE, TRUE,
5310                                         FALSE, /* page not looked up */
5311                                         &prot, &result_page, &top_page,
5312                                         (int *)0,
5313                                         NULL, map->no_zero_fill,
5314                                         FALSE, &fault_info);
5315                         } while (result == VM_FAULT_RETRY);
5316
5317                         /*
5318                          * If this was a mapping to a file on a device that has been forcibly
5319                          * unmounted, then we won't get a page back from vm_fault_page().  Just
5320                          * move on to the next one in case the remaining pages are mapped from
5321                          * different objects.  During a forced unmount, the object is terminated
5322                          * so the alive flag will be false if this happens.  A forced unmount will
5323                          * occur when an external disk is unplugged before the user does an
5324                          * eject, so we don't want to panic in that situation.
5325                          */
5326
5327                         if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
5328                                 continue;
5329
5330                         if (result == VM_FAULT_MEMORY_ERROR &&
5331                             object == kernel_object) {
5332                                 /*
5333                                  * This must have been allocated with
5334                                  * KMA_KOBJECT and KMA_VAONLY and there's
5335                                  * no physical page at this offset.
5336                                  * We're done (no page to free).
5337                                  */
5338                                 assert(deallocate);
5339                                 continue;
5340                         }
5341
                             /*
                              * NOTE(review): both "continue" paths above skip
                              * vm_fault_cleanup(); presumably vm_fault_page()
                              * has already released the object on error --
                              * confirm.
                              */
5342                         if (result != VM_FAULT_SUCCESS)
5343                                 panic("vm_fault_unwire: failure");
5344
                             /* cleanup below uses the object of the page found */
5345                         result_object = VM_PAGE_OBJECT(result_page);
5346
5347                         if (deallocate) {
5348                                 assert(VM_PAGE_GET_PHYS_PAGE(result_page) !=
5349                                        vm_page_fictitious_addr);
5350                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page));
5351                                 if (VM_PAGE_WIRED(result_page)) {
5352                                         unwired_pages++;
5353                                 }
5354                                 VM_PAGE_FREE(result_page);
5355                         } else {
                                     /* the pmap wiring is left alone for guard pages */
5356                                 if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr))
5357                                         pmap_change_wiring(pmap,
5358                                             pmap_addr + (va - entry->vme_start), FALSE);
5359
5360
5361                                 if (VM_PAGE_WIRED(result_page)) {
5362                                         vm_page_lockspin_queues();
5363                                         vm_page_unwire(result_page, TRUE);
5364                                         vm_page_unlock_queues();
5365                                         unwired_pages++;
5366                                 }
                                     /*
                                      * NOTE(review): the flag is cleared after the
                                      * first page is zeroed, so later pages of this
                                      * range are not zeroed -- confirm this is
                                      * intended.
                                      */
5367                                 if(entry->zero_wired_pages) {
5368                                         pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page));
5369                                         entry->zero_wired_pages = FALSE;
5370                                 }
5371
5372                                 PAGE_WAKEUP_DONE(result_page);
5373                         }
5374                         vm_fault_cleanup(result_object, top_page);
5375                 }
5376         }
5377
5378         /*
5379          *      Inform the physical mapping system that the range
5380          *      of addresses may fault, so that page tables and
5381          *      such may be unwired themselves.
5382          */
5383
5384         pmap_pageable(pmap, pmap_addr,
5385                 pmap_addr + (end_addr - entry->vme_start), TRUE);
5386
             /*
              * Only for kernel_object: subtract the bytes of the pages
              * counted above from the entry's tag.
              */
5387         if (kernel_object == object) {
5388             vm_tag_update_size(fault_info.user_tag, -ptoa_64(unwired_pages));
5389         }
5390 }
5391
5392 /*
5393  *      vm_fault_wire_fast:
5394  *
5395  *      Handle common case of a wire down page fault at the given address.
5396  *      If successful, the page is inserted into the associated physical map.
5397  *      The map entry is passed in to avoid the overhead of a map lookup.
5398  *
5399  *      NOTE: the given address should be truncated to the
5400  *      proper page address.
5401  *
5402  *      KERN_SUCCESS is returned if the page fault is handled; otherwise,
5403  *      a standard error specifying why the fault is fatal is returned.
5404  *
5405  *      The map in question must be referenced, and remains so.
5406  *      Caller has a read lock on the map.
5407  *
5408  *      This is a stripped version of vm_fault() for wiring pages.  Anything
5409  *      other than the common case will return KERN_FAILURE, and the caller
5410  *      is expected to call vm_fault().
      *
      *      Parameters:
      *              map             unused.
      *              va              address to wire; with the entry it gives
      *                              the offset into the entry's object.
      *              caller_prot     unused; entry->protection is used instead.
      *              wire_tag        handed to vm_page_wire() and
      *                              vm_fault_enter().
      *              entry           map entry covering "va"; sub-map entries
      *                              are rejected with KERN_FAILURE.
      *              pmap, pmap_addr physical map and address the page is
      *                              entered at.
      *              physpage_p      if not NULL, receives the physical page
      *                              number on success; the page is also
      *                              marked dirty when the entry's protection
      *                              includes VM_PROT_WRITE.
      *
      *      KERN_FAILURE is returned when the entry is a sub-map, the page
      *      is not resident in the entry's object, is busy, or is unusual
      *      with error/restart/absent set, when the entry is writable and
      *      the object has a copy object, or when vm_fault_enter() fails.
      *      A guard page returns KERN_SUCCESS without being wired or
      *      entered into the pmap.
5411  */
5412 static kern_return_t
5413 vm_fault_wire_fast(
5414         __unused vm_map_t       map,
5415         vm_map_offset_t va,
5416         __unused vm_prot_t       caller_prot,
5417         vm_tag_t        wire_tag,
5418         vm_map_entry_t  entry,
5419         pmap_t          pmap,
5420         vm_map_offset_t pmap_addr,
5421         ppnum_t         *physpage_p)
5422 {
5423         vm_object_t             object;
5424         vm_object_offset_t      offset;
5425         vm_page_t               m;
5426         vm_prot_t               prot;
5427         thread_t                thread = current_thread();
5428         int                     type_of_fault;
5429         kern_return_t           kr;
5430
             /* fault statistics: global counter, then the current task's */
5431         VM_STAT_INCR(faults);
5432
5433         if (thread != THREAD_NULL && thread->task != TASK_NULL)
5434           thread->task->faults++;
5435
5436 /*
5437  *      Recovery actions
      *
      *      RELEASE_PAGE(m)         PAGE_WAKEUP_DONE() on the page, then
      *                              vm_page_unwire() under the page queues
      *                              lock.
      *      UNLOCK_THINGS           vm_object_paging_end() and
      *                              vm_object_unlock() on "object".
      *      UNLOCK_AND_DEALLOCATE   UNLOCK_THINGS, then
      *                              vm_object_deallocate() on "object".
      *      GIVE_UP                 UNLOCK_AND_DEALLOCATE, then return
      *                              KERN_FAILURE from the enclosing function.
      *
      *      None of these is #undef'd within this function.
5438  */
5439
5440 #undef  RELEASE_PAGE
5441 #define RELEASE_PAGE(m) {                               \
5442         PAGE_WAKEUP_DONE(m);                            \
5443         vm_page_lockspin_queues();                      \
5444         vm_page_unwire(m, TRUE);                        \
5445         vm_page_unlock_queues();                        \
5446 }
5447
5448
5449 #undef  UNLOCK_THINGS
5450 #define UNLOCK_THINGS   {                               \
5451         vm_object_paging_end(object);                      \
5452         vm_object_unlock(object);                          \
5453 }
5454
5455 #undef  UNLOCK_AND_DEALLOCATE
5456 #define UNLOCK_AND_DEALLOCATE   {                       \
5457         UNLOCK_THINGS;                                  \
5458         vm_object_deallocate(object);                   \
5459 }
5460 /*
5461  *      Give up and have caller do things the hard way.
5462  */
5463
5464 #define GIVE_UP {                                       \
5465         UNLOCK_AND_DEALLOCATE;                          \
5466         return(KERN_FAILURE);                           \
5467 }
5468
5469
5470         /*
5471          *      If this entry is not directly to a vm_object, bail out.
              *      Nothing has been locked or referenced yet, so a plain
              *      return is used rather than GIVE_UP.
5472          */
5473         if (entry->is_sub_map) {
5474                 assert(physpage_p == NULL);
5475                 return(KERN_FAILURE);
5476         }
5477
5478         /*
5479          *      Find the backing store object and offset into it.
              *      The protection comes from the entry, not from caller_prot.
5480          */
5481
5482         object = VME_OBJECT(entry);
5483         offset = (va - entry->vme_start) + VME_OFFSET(entry);
5484         prot = entry->protection;
5485
5486         /*
5487          *      Make a reference to this object to prevent its
5488          *      disposal while we are messing with it.
5489          */
5490
5491         vm_object_lock(object);
5492         vm_object_reference_locked(object);
5493         vm_object_paging_begin(object);
5494
5495         /*
5496          *      INVARIANTS (through entire routine):
5497          *
5498          *      1)      At all times, we must either have the object
5499          *              lock or a busy page in some object to prevent
5500          *              some other thread from trying to bring in
5501          *              the same page.
5502          *
5503          *      2)      Once we have a busy page, we must remove it from
5504          *              the pageout queues, so that the pageout daemon
5505          *              will not grab it away.
5506          *
5507          */
5508
5509         /*
5510          *      Look for page in top-level object.  If it's not there or
5511          *      there's something going on, give up.
5512          */
5513         m = vm_page_lookup(object, offset);
5514         if ((m == VM_PAGE_NULL) || (m->busy) ||
5515             (m->unusual && ( m->error || m->restart || m->absent))) {
5516
5517                 GIVE_UP;
5518         }
5519         if (m->fictitious &&
5520             VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
5521                 /*
5522                  * Guard pages are fictitious pages and are never
5523                  * entered into a pmap, so let's say it's been wired...
                      * This skips the wire, busy and vm_fault_enter() steps
                      * below; "done" still runs PAGE_WAKEUP_DONE() on the page.
5524                  */
5525                 kr = KERN_SUCCESS;
5526                 goto done;
5527         }
5528
5529         /*
5530          *      Wire the page down now.  All bail outs beyond this
5531          *      point must unwire the page.
5532          */
5533
5534         vm_page_lockspin_queues();
5535         vm_page_wire(m, wire_tag, TRUE);
5536         vm_page_unlock_queues();
5537
5538         /*
5539          *      Mark page busy for other threads.
5540          */
5541         assert(!m->busy);
5542         m->busy = TRUE;
5543         assert(!m->absent);
5544
5545         /*
5546          *      Give up if the page is being written and there's a copy object
5547          */
5548         if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
5549                 RELEASE_PAGE(m);
5550                 GIVE_UP;
5551         }
5552
5553         /*
5554          *      Put this page into the physical map.
              *      The entry's protection is passed for both protection
              *      arguments of vm_fault_enter().
5555          */
5556         type_of_fault = DBG_CACHE_HIT_FAULT;
5557         kr = vm_fault_enter(m,
5558                             pmap,
5559                             pmap_addr,
5560                             prot,
5561                             prot,
5562                             TRUE,  /* wired */
5563                             FALSE, /* change_wiring */
5564                             wire_tag,
5565                             FALSE, /* no_cache */
5566                             FALSE, /* cs_bypass */
5567                             VME_ALIAS(entry),
5568                             ((entry->iokit_acct ||
5569                               (!entry->is_sub_map && !entry->use_pmap))
5570                              ? PMAP_OPTIONS_ALT_ACCT
5571                              : 0),
5572                             NULL,
5573                             &type_of_fault);
5574         if (kr != KERN_SUCCESS) {
                     /*
                      * vm_fault_enter()'s error code is not propagated:
                      * GIVE_UP always returns KERN_FAILURE.
                      */
5575                 RELEASE_PAGE(m);
5576                 GIVE_UP;
5577         }
5578
5579 done:
5580         /*
5581          *      Unlock everything, and return
              *
              *      NOTE(review): every path that reaches this label has
              *      kr == KERN_SUCCESS (all failures leave through GIVE_UP or
              *      the sub-map return), so the "*physpage_p = 0" branch below
              *      looks unreachable -- confirm.
5582          */
5583
5584         if (physpage_p) {
5585                 /* for vm_map_wire_and_extract() */
5586                 if (kr == KERN_SUCCESS) {
5587                         assert(object == VM_PAGE_OBJECT(m));
5588                         *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
5589                         if (prot & VM_PROT_WRITE) {
5590                                 vm_object_lock_assert_exclusive(object);
5591                                 m->dirty = TRUE;
5592                         }
5593                 } else {
5594                         *physpage_p = 0;
5595                 }
5596         }
5597
             /* the page stays wired; only the locks and references are dropped */
5598         PAGE_WAKEUP_DONE(m);
5599         UNLOCK_AND_DEALLOCATE;
5600
5601         return kr;
5602
5603 }
5604
5605 /*
5606  *      Routine:        vm_fault_copy_cleanup
5607  *      Purpose:
5608  *              Release a page used by vm_fault_copy.
      *
      *              Locks the object that owns "page", runs PAGE_WAKEUP_DONE()
      *              on the page, calls vm_page_activate() on it if it is not
      *              currently pageable, and then hands the object and
      *              "top_page" to vm_fault_cleanup().
      *
      *      Parameters:
      *              page            page to release; it is passed to
      *                              VM_PAGE_OBJECT() without a null check.
      *              top_page        forwarded unchanged to vm_fault_cleanup().
      *
      *      NOTE(review): the object lock taken here is not dropped in this
      *      routine; presumably vm_fault_cleanup() releases it -- confirm.
5609  */
5610
5611 static void
5612 vm_fault_copy_cleanup(
5613         vm_page_t       page,
5614         vm_page_t       top_page)
5615 {
5616         vm_object_t     object = VM_PAGE_OBJECT(page);
5617
5618         vm_object_lock(object);
5619         PAGE_WAKEUP_DONE(page);
             /*
              * The pageable test is made once without the page queues lock and
              * repeated once it is held; presumably the queue state can change
              * in between -- confirm.
              */
5620         if ( !VM_PAGE_PAGEABLE(page)) {
5621                 vm_page_lockspin_queues();
5622                 if ( !VM_PAGE_PAGEABLE(page)) {
5623                         vm_page_activate(page);
5624                 }
5625                 vm_page_unlock_queues();
5626         }
5627         vm_fault_cleanup(object, top_page);
5628 }
5629
5630 static void
5631 vm_fault_copy_dst_cleanup(
5632         vm_page_t       page)
5633 {
             /*
              * Undo what vm_fault_copy() holds on a destination page: unwire
              * the page and end the paging activity on its object.  A null
              * page is accepted and ignored.
              */
5634         vm_object_t     page_object;
5635
5636         if (page == VM_PAGE_NULL)
5637                 return;
5638
5639         page_object = VM_PAGE_OBJECT(page);
5640         vm_object_lock(page_object);
             /* the unwire is done with the page queues lock held */
5641         vm_page_lockspin_queues();
5642         vm_page_unwire(page, TRUE);
5643         vm_page_unlock_queues();
5644         vm_object_paging_end(page_object);
5645         vm_object_unlock(page_object);
     }
5646
5647 /*
5648  *      Routine:        vm_fault_copy
5649  *
5650  *      Purpose:
5651  *              Copy pages from one virtual memory object to another --
5652  *              neither the source nor destination pages need be resident.
5653  *
5654  *              Before actually copying a page, the version associated with
5655  *              the destination address map will be verified.
      *
      *              If "src_object" is VM_OBJECT_NULL there is nothing to copy
      *              from and the destination range is zero-filled instead.
5656  *
5657  *      In/out conditions:
5658  *              The caller must hold a reference, but not a lock, to
5659  *              each of the source and destination objects and to the
5660  *              destination map.
5661  *
5662  *      Results:
5663  *              Returns KERN_SUCCESS if no errors were encountered in
5664  *              reading or writing the data.  Returns MACH_SEND_INTERRUPTED
5665  *              if the operation was interrupted (only possible if the
5666  *              "interruptible" argument is asserted).  Other return values
5667  *              indicate a permanent error in copying the data.
5668  *
5669  *              The actual amount of data copied will be returned in the
5670  *              "copy_size" argument.  In the event that the destination map
5671  *              verification failed, this amount may be less than the amount
5672  *              requested.
      *
      *              A failed map verification, or a change of the destination
      *              object's copy object while a page was being prepared, stops
      *              the copy early but still returns KERN_SUCCESS; the caller
      *              must look at "copy_size" to detect the short copy.
      *
      *              The memory-error exits return without adjusting
      *              "copy_size".
5673  */
5674 kern_return_t
5675 vm_fault_copy(
5676         vm_object_t             src_object,
5677         vm_object_offset_t      src_offset,
5678         vm_map_size_t           *copy_size,             /* INOUT */
5679         vm_object_t             dst_object,
5680         vm_object_offset_t      dst_offset,
5681         vm_map_t                dst_map,
5682         vm_map_version_t         *dst_version,
5683         int                     interruptible)
5684 {
5685         vm_page_t               result_page;
5686
5687         vm_page_t               src_page;
5688         vm_page_t               src_top_page;
5689         vm_prot_t               src_prot;
5690
5691         vm_page_t               dst_page;
5692         vm_page_t               dst_top_page;
5693         vm_prot_t               dst_prot;
5694
5695         vm_map_size_t           amount_left;
5696         vm_object_t             old_copy_object;
5697         vm_object_t             result_page_object = NULL;
5698         kern_return_t           error = 0;
5699         vm_fault_return_t       result;
5700
5701         vm_map_size_t           part_size;
5702         struct vm_object_fault_info fault_info_src;
5703         struct vm_object_fault_info fault_info_dst;
5704
5705         /*
5706          * In order not to confuse the clustered pageins, align
5707          * the different offsets on a page boundary.
5708          */
5709
             /*
              * RETURN(x): turn *copy_size from "amount requested" into
              * "amount actually copied" and return x.
              */
5710 #define RETURN(x)                                       \
5711         MACRO_BEGIN                                     \
5712         *copy_size -= amount_left;                      \
5713         MACRO_RETURN(x);                                \
5714         MACRO_END
5715
5716         amount_left = *copy_size;
5717
5718         fault_info_src.interruptible = interruptible;
5719         fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
5720         fault_info_src.user_tag  = 0;
5721         fault_info_src.pmap_options = 0;
5722         fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
5723         fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
5724         fault_info_src.no_cache   = FALSE;
5725         fault_info_src.stealth = TRUE;
5726         fault_info_src.io_sync = FALSE;
5727         fault_info_src.cs_bypass = FALSE;
5728         fault_info_src.mark_zf_absent = FALSE;
5729         fault_info_src.batch_pmap_op = FALSE;
5730
5731         fault_info_dst.interruptible = interruptible;
5732         fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
5733         fault_info_dst.user_tag  = 0;
5734         fault_info_dst.pmap_options = 0;
5735         fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
5736         fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
5737         fault_info_dst.no_cache   = FALSE;
5738         fault_info_dst.stealth = TRUE;
5739         fault_info_dst.io_sync = FALSE;
5740         fault_info_dst.cs_bypass = FALSE;
5741         fault_info_dst.mark_zf_absent = FALSE;
5742         fault_info_dst.batch_pmap_op = FALSE;
5743
5744         do { /* while (amount_left > 0) */
5745                 /*
5746                  * There may be a deadlock if both source and destination
5747                  * pages are the same. To avoid this deadlock, the copy must
5748                  * start by getting the destination page in order to apply
5749                  * COW semantics if any.
5750                  */
5751
5752         RetryDestinationFault: ;
5753
5754                 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
5755
5756                 vm_object_lock(dst_object);
5757                 vm_object_paging_begin(dst_object);
5758
5759                 if (amount_left > (vm_size_t) -1) {
5760                         /* 32-bit overflow */
5761                         fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
5762                 } else {
5763                         fault_info_dst.cluster_size = (vm_size_t) amount_left;
5764                         assert(fault_info_dst.cluster_size == amount_left);
5765                 }
5766
5767                 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
5768                 dst_page = VM_PAGE_NULL;
5769                 result = vm_fault_page(dst_object,
5770                                        vm_object_trunc_page(dst_offset),
5771                                        VM_PROT_WRITE|VM_PROT_READ,
5772                                        FALSE,
5773                                        FALSE, /* page not looked up */
5774                                        &dst_prot, &dst_page, &dst_top_page,
5775                                        (int *)0,
5776                                        &error,
5777                                        dst_map->no_zero_fill,
5778                                        FALSE, &fault_info_dst);
5779                 switch (result) {
5780                 case VM_FAULT_SUCCESS:
5781                         break;
5782                 case VM_FAULT_RETRY:
5783                         goto RetryDestinationFault;
5784                 case VM_FAULT_MEMORY_SHORTAGE:
5785                         if (vm_page_wait(interruptible))
5786                                 goto RetryDestinationFault;
5787                         /* fall thru */
5788                 case VM_FAULT_INTERRUPTED:
5789                         RETURN(MACH_SEND_INTERRUPTED);
5790                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5791                         /* success but no VM page: fail the copy */
5792                         vm_object_paging_end(dst_object);
5793                         vm_object_unlock(dst_object);
5794                         /*FALLTHROUGH*/
5795                 case VM_FAULT_MEMORY_ERROR:
                             /* direct return: *copy_size is left untouched here */
5796                         if (error)
5797                                 return (error);
5798                         else
5799                                 return(KERN_MEMORY_ERROR);
5800                 default:
5801                         panic("vm_fault_copy: unexpected error 0x%x from "
5802                               "vm_fault_page()\n", result);
5803                 }
5804                 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
5805
5806                 assert(dst_object == VM_PAGE_OBJECT(dst_page));
                     /* remembered so a change can be detected before the copy */
5807                 old_copy_object = dst_object->copy;
5808
5809                 /*
5810                  * There exists the possibility that the source and
5811                  * destination page are the same.  But we can't
5812                  * easily determine that now.  If they are the
5813                  * same, the call to vm_fault_page() for the
5814                  * destination page will deadlock.  To prevent this we
5815                  * wire the page so we can drop busy without having
5816                  * the page daemon steal the page.  We clean up the
5817                  * top page  but keep the paging reference on the object
5818                  * holding the dest page so it doesn't go away.
5819                  */
5820
5821                 vm_page_lockspin_queues();
5822                 vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE);
5823                 vm_page_unlock_queues();
5824                 PAGE_WAKEUP_DONE(dst_page);
5825                 vm_object_unlock(dst_object);
5826
5827                 if (dst_top_page != VM_PAGE_NULL) {
5828                         vm_object_lock(dst_object);
5829                         VM_PAGE_FREE(dst_top_page);
5830                         vm_object_paging_end(dst_object);
5831                         vm_object_unlock(dst_object);
5832                 }
5833
5834         RetrySourceFault: ;
5835
5836                 if (src_object == VM_OBJECT_NULL) {
5837                         /*
5838                          *      No source object.  We will just
5839                          *      zero-fill the page in dst_object.
5840                          */
5841                         src_page = VM_PAGE_NULL;
5842                         result_page = VM_PAGE_NULL;
5843                 } else {
5844                         vm_object_lock(src_object);
5845                         src_page = vm_page_lookup(src_object,
5846                                                   vm_object_trunc_page(src_offset));
5847                         if (src_page == dst_page) {
                                     /*
                                      * Source and destination are the same
                                      * resident page (wired above), so no
                                      * source fault is taken.  Copy from that
                                      * page instead of treating it as "no
                                      * source", and record src_object as the
                                      * object whose lock -- taken just above --
                                      * is dropped after this if/else.  The
                                      * cleanup paths already skip
                                      * vm_fault_copy_cleanup() when
                                      * src_page == dst_page.
                                      *
                                      * NOTE(review): this relies on
                                      * vm_page_copy() / vm_page_part_copy()
                                      * tolerating source == destination --
                                      * confirm.
                                      */
5848                                 src_prot = dst_prot;
5849                                 result_page = src_page;
                                     result_page_object = src_object;
5850                         } else {
5851                                 src_prot = VM_PROT_READ;
5852                                 vm_object_paging_begin(src_object);
5853
5854                                 if (amount_left > (vm_size_t) -1) {
5855                                         /* 32-bit overflow */
5856                                         fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
5857                                 } else {
5858                                         fault_info_src.cluster_size = (vm_size_t) amount_left;
5859                                         assert(fault_info_src.cluster_size == amount_left);
5860                                 }
5861
5862                                 XPR(XPR_VM_FAULT,
5863                                         "vm_fault_copy(2) -> vm_fault_page\n",
5864                                         0,0,0,0,0);
5865                                 result_page = VM_PAGE_NULL;
5866                                 result = vm_fault_page(
5867                                         src_object,
5868                                         vm_object_trunc_page(src_offset),
5869                                         VM_PROT_READ, FALSE,
5870                                         FALSE, /* page not looked up */
5871                                         &src_prot,
5872                                         &result_page, &src_top_page,
5873                                         (int *)0, &error, FALSE,
5874                                         FALSE, &fault_info_src);
5875
5876                                 switch (result) {
5877                                 case VM_FAULT_SUCCESS:
5878                                         break;
5879                                 case VM_FAULT_RETRY:
5880                                         goto RetrySourceFault;
5881                                 case VM_FAULT_MEMORY_SHORTAGE:
5882                                         if (vm_page_wait(interruptible))
5883                                                 goto RetrySourceFault;
5884                                         /* fall thru */
5885                                 case VM_FAULT_INTERRUPTED:
5886                                         vm_fault_copy_dst_cleanup(dst_page);
5887                                         RETURN(MACH_SEND_INTERRUPTED);
5888                                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5889                                         /* success but no VM page: fail */
5890                                         vm_object_paging_end(src_object);
5891                                         vm_object_unlock(src_object);
5892                                         /*FALLTHROUGH*/
5893                                 case VM_FAULT_MEMORY_ERROR:
                                             /* direct return: *copy_size is left untouched here */
5894                                         vm_fault_copy_dst_cleanup(dst_page);
5895                                         if (error)
5896                                                 return (error);
5897                                         else
5898                                                 return(KERN_MEMORY_ERROR);
5899                                 default:
5900                                         panic("vm_fault_copy(2): unexpected "
5901                                               "error 0x%x from "
5902                                               "vm_fault_page()\n", result);
5903                                 }
5904
5905                                 result_page_object = VM_PAGE_OBJECT(result_page);
5906                                 assert((src_top_page == VM_PAGE_NULL) ==
5907                                        (result_page_object == src_object));
5908                         }
5909                         assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
                             /*
                              * result_page_object is set on both branches above,
                              * so this always releases a lock that is held.
                              */
5910                         vm_object_unlock(result_page_object);
5911                 }
5912
5913                 vm_map_lock_read(dst_map);
5914
5915                 if (!vm_map_verify(dst_map, dst_version)) {
                             /* stop early; RETURN() below reports the short copy */
5916                         vm_map_unlock_read(dst_map);
5917                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
5918                                 vm_fault_copy_cleanup(result_page, src_top_page);
5919                         vm_fault_copy_dst_cleanup(dst_page);
5920                         break;
5921                 }
5922                 assert(dst_object == VM_PAGE_OBJECT(dst_page));
5923
5924                 vm_object_lock(dst_object);
5925
5926                 if (dst_object->copy != old_copy_object) {
                             /* copy object changed since the fault: stop early */
5927                         vm_object_unlock(dst_object);
5928                         vm_map_unlock_read(dst_map);
5929                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
5930                                 vm_fault_copy_cleanup(result_page, src_top_page);
5931                         vm_fault_copy_dst_cleanup(dst_page);
5932                         break;
5933                 }
5934                 vm_object_unlock(dst_object);
5935
5936                 /*
5937                  *      Copy the page, and note that it is dirty
5938                  *      immediately.
5939                  */
5940
5941                 if (!page_aligned(src_offset) ||
5942                         !page_aligned(dst_offset) ||
5943                         !page_aligned(amount_left)) {
5944
5945                         vm_object_offset_t      src_po,
5946                                                 dst_po;
5947
5948                         src_po = src_offset - vm_object_trunc_page(src_offset);
5949                         dst_po = dst_offset - vm_object_trunc_page(dst_offset);
5950
                             /*
                              * Copy only up to the nearer page boundary (the
                              * larger of the two in-page offsets), capped by
                              * what is left to copy.
                              */
5951                         if (dst_po > src_po) {
5952                                 part_size = PAGE_SIZE - dst_po;
5953                         } else {
5954                                 part_size = PAGE_SIZE - src_po;
5955                         }
5956                         if (part_size > (amount_left)){
5957                                 part_size = amount_left;
5958                         }
5959
5960                         if (result_page == VM_PAGE_NULL) {
5961                                 assert((vm_offset_t) dst_po == dst_po);
5962                                 assert((vm_size_t) part_size == part_size);
5963                                 vm_page_part_zero_fill(dst_page,
5964                                                        (vm_offset_t) dst_po,
5965                                                        (vm_size_t) part_size);
5966                         } else {
5967                                 assert((vm_offset_t) src_po == src_po);
5968                                 assert((vm_offset_t) dst_po == dst_po);
5969                                 assert((vm_size_t) part_size == part_size);
5970                                 vm_page_part_copy(result_page,
5971                                                   (vm_offset_t) src_po,
5972                                                   dst_page,
5973                                                   (vm_offset_t) dst_po,
5974                                                   (vm_size_t)part_size);
5975                                 if(!dst_page->dirty){
5976                                         vm_object_lock(dst_object);
5977                                         SET_PAGE_DIRTY(dst_page, TRUE);
5978                                         vm_object_unlock(dst_object);
5979                                 }
5980
5981                         }
5982                 } else {
5983                         part_size = PAGE_SIZE;
5984
5985                         if (result_page == VM_PAGE_NULL)
5986                                 vm_page_zero_fill(dst_page);
5987                         else{
5988                                 vm_object_lock(result_page_object);
5989                                 vm_page_copy(result_page, dst_page);
5990                                 vm_object_unlock(result_page_object);
5991
5992                                 if(!dst_page->dirty){
5993                                         vm_object_lock(dst_object);
5994                                         SET_PAGE_DIRTY(dst_page, TRUE);
5995                                         vm_object_unlock(dst_object);
5996                                 }
5997                         }
5998
5999                 }
6000
6001                 /*
6002                  *      Unlock everything, and return
6003                  */
6004
6005                 vm_map_unlock_read(dst_map);
6006
6007                 if (result_page != VM_PAGE_NULL && src_page != dst_page)
6008                         vm_fault_copy_cleanup(result_page, src_top_page);
6009                 vm_fault_copy_dst_cleanup(dst_page);
6010
6011                 amount_left -= part_size;
6012                 src_offset += part_size;
6013                 dst_offset += part_size;
6014         } while (amount_left > 0);
6015
6016         RETURN(KERN_SUCCESS);
6017 #undef  RETURN
6018
6019         /*NOTREACHED*/
6020 }
6021
6022 #if     VM_FAULT_CLASSIFY
6023 /*
6024  *      Temporary statistics gathering support.
6025  */
6026
6027 /*
6028  *      Statistics arrays:
6029  */
6030 #define VM_FAULT_TYPES_MAX      5       /* first dimension: one row per VM_FAULT_TYPE_* */
6031 #define VM_FAULT_LEVEL_MAX      8       /* second dimension: one column per "level" */
6032
     /*
      * Counters indexed [fault type][level], incremented by
      * vm_fault_classify().  "level" there counts how many shadow links were
      * followed before the fault could be classified.
      */
6033 int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
6034
6035 #define VM_FAULT_TYPE_ZERO_FILL 0       /* no page, no pager created, no shadow */
6036 #define VM_FAULT_TYPE_MAP_IN    1       /* resident page; read, or write at level 0 with no copy object */
6037 #define VM_FAULT_TYPE_PAGER     2       /* no page, but the object has a pager created */
6038 #define VM_FAULT_TYPE_COPY      3       /* resident page, any other write fault */
6039 #define VM_FAULT_TYPE_OTHER     4       /* resident page that is busy, error, restart or absent */
6040
6041
6042 void
6043 vm_fault_classify(vm_object_t           object,
6044                   vm_object_offset_t    offset,
6045                   vm_prot_t             fault_type)
6046 {
             /*
              * Statistics helper: work out which VM_FAULT_TYPE_* category a
              * fault of "fault_type" at (object, offset) falls into and bump
              * the matching vm_fault_stats[type][level] counter.
              *
              * The lookup starts at "object" and follows object->shadow
              * (adding vo_shadow_offset to the offset) for as long as neither
              * a resident page nor a pager is found; "level" is the number of
              * shadow links followed.  Nothing is returned and nothing but
              * the counter is modified.
              *
              * NOTE(review): no object lock is taken here; presumably the
              * caller holds whatever is needed for vm_page_lookup() -- confirm.
              */
6047         int             type, level = 0;
6048         vm_page_t       m;
6049
6050         while (TRUE) {
6051                 m = vm_page_lookup(object, offset);
6052                 if (m != VM_PAGE_NULL) {
6053                         if (m->busy || m->error || m->restart || m->absent) {
6054                                 type = VM_FAULT_TYPE_OTHER;
6055                                 break;
6056                         }
6057                         if (((fault_type & VM_PROT_WRITE) == 0) ||
6058                             ((level == 0) && object->copy == VM_OBJECT_NULL)) {
6059                                 type = VM_FAULT_TYPE_MAP_IN;
6060                                 break;
6061                         }
6062                         type = VM_FAULT_TYPE_COPY;
6063                         break;
6064                 }
6065                 else {
6066                         if (object->pager_created) {
6067                                 type = VM_FAULT_TYPE_PAGER;
6068                                 break;
6069                         }
6070                         if (object->shadow == VM_OBJECT_NULL) {
6071                                 type = VM_FAULT_TYPE_ZERO_FILL;
6072                                 break;
6073                         }
6074
                             /* not here: continue the search in the shadow object */
6075                         offset += object->vo_shadow_offset;
6076                         object = object->shadow;
6077                         level++;
6078                         continue;
6079                 }
6080         }
6081
             /*
              * vm_fault_stats has VM_FAULT_LEVEL_MAX columns, so the largest
              * valid index is VM_FAULT_LEVEL_MAX - 1; deeper shadow chains are
              * all accounted in that last column.
              */
6082         if (level >= VM_FAULT_LEVEL_MAX)
6083                 level = VM_FAULT_LEVEL_MAX - 1;
6084
6085         vm_fault_stats[type][level] += 1;
6086
6087         return;
6088 }
6089
6090 /* cleanup routine to call from debugger */
6091
6092 void
6093 vm_fault_classify_init(void)
6094 {
6095         int     bucket_type;
6096         int     bucket_level;
6097
6098         /* Reset every [type][level] counter of vm_fault_stats to zero. */
6099         for (bucket_type = 0; bucket_type < VM_FAULT_TYPES_MAX; bucket_type++) {
6100                 for (bucket_level = 0; bucket_level < VM_FAULT_LEVEL_MAX; bucket_level++) {
6101                         vm_fault_stats[bucket_type][bucket_level] = 0;
6102                 }
6103         }
6104 }
6105 #endif  /* VM_FAULT_CLASSIFY */
6106
6107 vm_offset_t
6108 kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr)
6109 {
             /*
              * Debugger-context lookup of the page backing the page-aligned
              * address "cur_target_addr" in "map".  Panics if called while
              * not_in_kdp is set.
              *
              * This routine itself takes no locks: the map lock and each
              * object lock are only probed, and the lookup is abandoned when
              * one of them is held exclusively.
              *
              * Returns:
              *      ptoa() of the physical page number when a usable resident
              *      page is found;
              *      kdp_compressor_decompressed_page_paddr when the data was
              *      successfully fetched from the compressor pager;
              *      0 in every other case.
              */
6110         vm_map_entry_t  entry;
6111         vm_object_t     object;
6112         vm_offset_t     object_offset;
6113         vm_page_t       m;
6114         int             compressor_external_state, compressed_count_delta;
6115         int             compressor_flags = (C_DONT_BLOCK | C_KEEP | C_KDP);
6116         int             my_fault_type = VM_PROT_READ;
6117         kern_return_t   kr;
6118
6119         if (not_in_kdp) {
6120                 panic("kdp_lightweight_fault called from outside of debugger context");
6121         }
6122
6123         assert(map != VM_MAP_NULL);
6124
             /* asserted, and also rejected at run time when asserts are compiled out */
6125         assert((cur_target_addr & PAGE_MASK) == 0);
6126         if ((cur_target_addr & PAGE_MASK) != 0) {
6127                 return 0;
6128         }
6129
6130         if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) {
6131                 return 0;
6132         }
6133
6134         if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) {
6135                 return 0;
6136         }
6137
             /* submaps are not followed */
6138         if (entry->is_sub_map) {
6139                 return 0;
6140         }
6141
6142         object = VME_OBJECT(entry);
6143         if (object == VM_OBJECT_NULL) {
6144                 return 0;
6145         }
6146
6147         object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry);
6148
             /* walk the shadow chain, starting at the entry's object */
6149         while (TRUE) {
6150                 if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) {
6151                         return 0;
6152                 }
6153
                     /* give up on an object with a pager that has paging or activity in progress */
6154                 if (object->pager_created && (object->paging_in_progress ||
6155                         object->activity_in_progress)) {
6156                         return 0;
6157                 }
6158
6159                 m = kdp_vm_page_lookup(object, object_offset);
6160
6161                 if (m != VM_PAGE_NULL) {
6162
                             /* only objects with the default WIMG bits are handled */
6163                         if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) {
6164                                 return 0;
6165                         }
6166
                             /* a page in any of these states is not returned */
6167                         if (m->laundry || m->busy || m->free_when_done || m->absent || m->error || m->cleaning ||
6168                                 m->overwriting || m->restart || m->unusual) {
6169                                 return 0;
6170                         }
6171
6172                         assert(!m->private);
6173                         if (m->private) {
6174                                 return 0;
6175                         }
6176
6177                         assert(!m->fictitious);
6178                         if (m->fictitious) {
6179                                 return 0;
6180                         }
6181
6182                         assert(m->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR);
6183                         if (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
6184                                 return 0;
6185                         }
6186
6187                         return ptoa(VM_PAGE_GET_PHYS_PAGE(m));
6188                 }
6189
6190                 compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
6191
                     /*
                      * No resident page.  MUST_ASK_PAGER() is handed
                      * compressor_external_state; if it then reads
                      * VM_EXTERNAL_STATE_EXISTS, ask the compressor pager for
                      * the data, passing kdp_compressor_decompressed_page_ppnum
                      * as the target page.
                      */
6192                 if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) {
6193                         if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) {
6194                                 kr = vm_compressor_pager_get(object->pager, (object_offset + object->paging_offset),
6195                                                                 kdp_compressor_decompressed_page_ppnum, &my_fault_type,
6196                                                                 compressor_flags, &compressed_count_delta);
6197                                 if (kr == KERN_SUCCESS) {
6198                                         return kdp_compressor_decompressed_page_paddr;
6199                                 } else {
6200                                         return 0;
6201                                 }
6202                         }
6203                 }
6204
                     /* end of the shadow chain: nothing backs this address */
6205                 if (object->shadow == VM_OBJECT_NULL) {
6206                         return 0;
6207                 }
6208
6209                 object_offset += object->vo_shadow_offset;
6210                 object = object->shadow;
6211         }
6212
6213 }
6214
6215 void
6216 vm_page_validate_cs_mapped(
6217         vm_page_t       page,
6218         const void      *kaddr)
6219 {
             /*
              * Code-signing validation of "page", whose contents are read
              * through "kaddr" (PAGE_SIZE_64 bytes are handed to
              * cs_validate_range()).
              *
              * The page is asserted to be busy and its object is asserted to
              * be locked exclusively.  The outcome is recorded in the page's
              * cs_validated, cs_tainted and cs_nx bits; nothing is returned.
              * The routine returns without validating when the page is
              * already validated or tainted, or when the object is not alive,
              * is terminating, or has no pager.
              */
6220         vm_object_t             object;
6221         vm_object_offset_t      offset;
6222         memory_object_t         pager;
6223         struct vnode            *vnode;
6224         boolean_t               validated;
6225         unsigned                tainted;
6226
6227         assert(page->busy);
6228         object = VM_PAGE_OBJECT(page);
6229         vm_object_lock_assert_exclusive(object);
6230
6231         if (page->wpmapped && !page->cs_tainted) {
6232                 /*
6233                  * This page was mapped for "write" access sometime in the
6234                  * past and could still be modifiable in the future.
6235                  * Consider it tainted.
6236                  * [ If the page was already found to be "tainted", no
6237                  * need to re-validate. ]
6238                  */
6239                 page->cs_validated = TRUE;
6240                 page->cs_tainted = TRUE;
6241                 if (cs_debug) {
6242                         printf("CODESIGNING: vm_page_validate_cs: "
6243                                "page %p obj %p off 0x%llx "
6244                                "was modified\n",
6245                                page, object, page->offset);
6246                 }
6247                 vm_cs_validated_dirtied++;
6248         }
6249
             /* already decided (including by the block just above): nothing to do */
6250         if (page->cs_validated || page->cs_tainted) {
6251                 return;
6252         }
6253
6254         vm_cs_validates++;
6255
6256         assert(object->code_signed);
6257         offset = page->offset;
6258
6259         if (!object->alive || object->terminating || object->pager == NULL) {
6260                 /*
6261                  * The object is terminating and we don't have its pager
6262                  * so we can't validate the data...
6263                  */
6264                 return;
6265         }
6266         /*
6267          * Since we get here to validate a page that was brought in by
6268          * the pager, we know that this pager is all setup and ready
6269          * by now.
6270          */
6271         assert(!object->internal);
6272         assert(object->pager != NULL);
6273         assert(object->pager_ready);
6274
6275         pager = object->pager;
6276         assert(object->paging_in_progress);
6277         vnode = vnode_pager_lookup_vnode(pager);
6278
6279         /* verify the SHA1 hash for this page */
6280         tainted = 0;
             /* the offset passed is the page's offset plus the object's paging_offset */
6281         validated = cs_validate_range(vnode,
6282                                       pager,
6283                                       (object->paging_offset +
6284                                        offset),
6285                                       (const void *)((const char *)kaddr),
6286                                       PAGE_SIZE_64,
6287                                       &tainted);
6288
             /* "tainted" comes back as a set of CS_VALIDATE_* flags */
6289         if (tainted & CS_VALIDATE_TAINTED) {
6290                 page->cs_tainted = TRUE;
6291         }
6292         if (tainted & CS_VALIDATE_NX) {
6293                 page->cs_nx = TRUE;
6294         }
6295
6296         if (validated) {
6297                 page->cs_validated = TRUE;
6298         }
6299 }
6300
6301 void
6302 vm_page_validate_cs(
6303         vm_page_t       page)
6304 {
6305         vm_object_t             object;
6306         vm_object_offset_t      offset;
6307         vm_map_offset_t         koffset;
6308         vm_map_size_t           ksize;
6309         vm_offset_t             kaddr;
6310         kern_return_t           kr;
6311         boolean_t               busy_page;
6312         boolean_t               need_unmap;
6313
6314         object = VM_PAGE_OBJECT(page);
6315         vm_object_lock_assert_held(object);
6316
6317         if (page->wpmapped && !page->cs_tainted) {
6318                 vm_object_lock_assert_exclusive(object);
6319
6320                 /*
6321                  * This page was mapped for "write" access sometime in the
6322                  * past and could still be modifiable in the future.
6323                  * Consider it tainted.
6324                  * [ If the page was already found to be "tainted", no
6325                  * need to re-validate. ]
6326                  */
6327                 page->cs_validated = TRUE;
6328                 page->cs_tainted = TRUE;
6329                 if (cs_debug) {
6330                         printf("CODESIGNING: vm_page_validate_cs: "
6331                                "page %p obj %p off 0x%llx "
6332                                "was modified\n",
6333                                page, object, page->offset);
6334                 }
6335                 vm_cs_validated_dirtied++;
6336         }
6337
6338         if (page->cs_validated || page->cs_tainted) {
6339                 return;
6340         }
6341
6342         if (page->slid) {
6343                 panic("vm_page_validate_cs(%p): page is slid\n", page);
6344         }
6345         assert(!page->slid);
6346
6347 #if CHECK_CS_VALIDATION_BITMAP
6348         if ( vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page(page->offset + object->paging_offset), CS_BITMAP_CHECK ) == KERN_SUCCESS) {
6349                 page->cs_validated = TRUE;
6350                 page->cs_tainted = FALSE;
6351                 vm_cs_bitmap_validated++;
6352                 return;
6353         }
6354 #endif
6355         vm_object_lock_assert_exclusive(object);
6356
6357         assert(object->code_signed);
6358         offset = page->offset;
6359
6360         busy_page = page->busy;
6361         if (!busy_page) {
6362                 /* keep page busy while we map (and unlock) the VM object */
6363                 page->busy = TRUE;
6364         }
6365
6366         /*
6367          * Take a paging reference on the VM object
6368          * to protect it from collapse or bypass,
6369          * and keep it from disappearing too.
6370          */
6371         vm_object_paging_begin(object);
6372
6373         /* map the page in the kernel address space */
6374         ksize = PAGE_SIZE_64;
6375         koffset = 0;
6376         need_unmap = FALSE;
6377         kr = vm_paging_map_object(page,
6378                                   object,
6379                                   offset,
6380                                   VM_PROT_READ,
6381                                   FALSE, /* can't unlock object ! */
6382                                   &ksize,
6383                                   &koffset,
6384                                   &need_unmap);
6385         if (kr != KERN_SUCCESS) {
6386                 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
6387         }
6388         kaddr = CAST_DOWN(vm_offset_t, koffset);
6389
6390         /* validate the mapped page */
6391         vm_page_validate_cs_mapped(page, (const void *) kaddr);
6392
6393 #if CHECK_CS_VALIDATION_BITMAP
6394         if ( page->cs_validated == TRUE && page->cs_tainted == FALSE ) {
6395                 vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page( offset + object->paging_offset), CS_BITMAP_SET );
6396         }
6397 #endif
6398         assert(page->busy);
6399         assert(object == VM_PAGE_OBJECT(page));
6400         vm_object_lock_assert_exclusive(object);
6401
6402         if (!busy_page) {
6403                 PAGE_WAKEUP_DONE(page);
6404         }
6405         if (need_unmap) {
6406                 /* unmap the map from the kernel address space */
6407                 vm_paging_unmap_object(object, koffset, koffset + ksize);
6408                 koffset = 0;
6409                 ksize = 0;
6410                 kaddr = 0;
6411         }
6412         vm_object_paging_end(object);
6413 }
6414
6415 void
6416 vm_page_validate_cs_mapped_chunk(
6417         vm_page_t       page,
6418         const void      *kaddr,
6419         vm_offset_t     chunk_offset,
6420         vm_size_t       chunk_size,
6421         boolean_t       *validated_p,
6422         unsigned        *tainted_p)
6423 {
6424         vm_object_t             object;
6425         vm_object_offset_t      offset, offset_in_page;
6426         memory_object_t         pager;
6427         struct vnode            *vnode;
6428         boolean_t               validated;
6429         unsigned                tainted;
6430
6431         *validated_p = FALSE;
6432         *tainted_p = 0;
6433
6434         assert(page->busy);
6435         object = VM_PAGE_OBJECT(page);
6436         vm_object_lock_assert_exclusive(object);
6437
6438         assert(object->code_signed);
6439         offset = page->offset;
6440
6441         if (!object->alive || object->terminating || object->pager == NULL) {
6442                 /*
6443                  * The object is terminating and we don't have its pager
6444                  * so we can't validate the data...
6445                  */
6446                 return;
6447         }
6448         /*
6449          * Since we get here to validate a page that was brought in by
6450          * the pager, we know that this pager is all setup and ready
6451          * by now.
6452          */
6453         assert(!object->internal);
6454         assert(object->pager != NULL);
6455         assert(object->pager_ready);
6456
6457         pager = object->pager;
6458         assert(object->paging_in_progress);
6459         vnode = vnode_pager_lookup_vnode(pager);
6460
6461         /* verify the signature for this chunk */
6462         offset_in_page = chunk_offset;
6463         assert(offset_in_page < PAGE_SIZE);
6464
6465         tainted = 0;
6466         validated = cs_validate_range(vnode,
6467                                       pager,
6468                                       (object->paging_offset +
6469                                        offset +
6470                                        offset_in_page),
6471                                       (const void *)((const char *)kaddr
6472                                                     + offset_in_page),
6473                                       chunk_size,
6474                                       &tainted);
6475         if (validated) {
6476                 *validated_p = TRUE;
6477         }
6478         if (tainted) {
6479                 *tainted_p = tainted;
6480         }
6481 }