osfmk/vm/vm_fault.c

   1 /*
   2  * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   vm_fault.c
  60  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  61  *
  62  *      Page fault handling module.
  63  */
  64
  65 #include <mach_cluster_stats.h>
  66 #include <mach_pagemap.h>
  67 #include <mach_kdb.h>
  68 #include <libkern/OSAtomic.h>
  69
  70 #include <mach/mach_types.h>
  71 #include <mach/kern_return.h>
  72 #include <mach/message.h>       /* for error codes */
  73 #include <mach/vm_param.h>
  74 #include <mach/vm_behavior.h>
  75 #include <mach/memory_object.h>
  76                                 /* For memory_object_data_{request,unlock} */
  77 #include <mach/sdt.h>
  78
  79 #include <kern/kern_types.h>
  80 #include <kern/host_statistics.h>
  81 #include <kern/counters.h>
  82 #include <kern/task.h>
  83 #include <kern/thread.h>
  84 #include <kern/sched_prim.h>
  85 #include <kern/host.h>
  86 #include <kern/xpr.h>
  87 #include <kern/mach_param.h>
  88 #include <kern/macro_help.h>
  89 #include <kern/zalloc.h>
  90 #include <kern/misc_protos.h>
  91
  92 #include <ppc/proc_reg.h>
  93
  94 #include <vm/vm_fault.h>
  95 #include <vm/vm_map.h>
  96 #include <vm/vm_object.h>
  97 #include <vm/vm_page.h>
  98 #include <vm/vm_kern.h>
  99 #include <vm/pmap.h>
 100 #include <vm/vm_pageout.h>
 101 #include <vm/vm_protos.h>
 102 #include <vm/vm_external.h>
 103 #include <vm/memory_object.h>
 104 #include <vm/vm_purgeable_internal.h>   /* Needed by some vm_page.h macros */
 105
 106 #include <sys/kdebug.h>
 107
/*
 * Compile-time switches local to this file.
 *
 * VM_FAULT_CLASSIFY: when non-zero, the vm_fault_classify*() prototypes
 * further down are compiled in.
 */
#define VM_FAULT_CLASSIFY       0

/* When non-zero, the dbgTrace() calls in the fault path are compiled in. */
#define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */

/*
 * NOTE(review): not referenced in this part of the file; presumably a
 * limit on concurrent page-ins against one object -- confirm at the users.
 */
int     vm_object_pagein_throttle = 16;
 113
/*
 * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
 * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
 * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
 * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
 * keep the UI active so that the user has a chance to kill the offending task before the system
 * completely hangs.
 *
 * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
 * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
 * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
 * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
 */

/* Defined elsewhere; consulted by the second arm of the throttle test below. */
boolean_t thread_is_io_throttled(void);

/* Resident-size cutoff for a "bloated" task; set in vm_fault_init(). */
uint64_t vm_hard_throttle_threshold;

/* NOTE(review): presumably the default pager's free/reserved swap page counts -- confirm. */
extern unsigned int dp_pages_free, dp_pages_reserve;

/*
 * True when the current task should be hard throttled.  Two independent
 * arms, each requiring the task's resident size to exceed
 * vm_hard_throttle_threshold:
 *   1. dp_pages_free + dp_pages_reserve is below 2000, the task is not
 *      the kernel task, and memory_manager_default is a valid port;
 *   2. vm_page_free_count is below vm_page_throttle_limit and the
 *      current thread is I/O throttled.
 * Note that get_task_resident_size() appears in both arms, so it may be
 * evaluated twice per expansion.
 */
#define NEED_TO_HARD_THROTTLE_THIS_TASK()       (((dp_pages_free + dp_pages_reserve < 2000) && \
                                                 (get_task_resident_size(current_task()) > vm_hard_throttle_threshold) && \
                                                 (current_task() != kernel_task) && IP_VALID(memory_manager_default)) || \
                                                 (vm_page_free_count < vm_page_throttle_limit && thread_is_io_throttled() && \
                                                  (get_task_resident_size(current_task()) > vm_hard_throttle_threshold)))


/* Argument passed to delay() in vm_fault_check() when hard throttling. */
#define HARD_THROTTLE_DELAY     10000   /* 10000 us == 10 ms */
 142
 143
/* Code-signing debug setting; may be overridden from the "cs_debug" boot-arg in vm_fault_init(). */
extern int cs_debug;

#if     MACH_KDB
extern struct db_watchpoint *db_watchpoint_list;
#endif  /* MACH_KDB */

/* Used by vm_fault_check() to detect an abort after a hard-throttle delay. */
boolean_t current_thread_aborted(void);

/* Forward declarations of internal routines. */
extern kern_return_t vm_fault_wire_fast(
                                vm_map_t        map,
                                vm_map_offset_t va,
                                vm_map_entry_t  entry,
                                pmap_t          pmap,
                                vm_map_offset_t pmap_addr);

extern void vm_fault_continue(void);

extern void vm_fault_copy_cleanup(
                                vm_page_t       page,
                                vm_page_t       top_page);

extern void vm_fault_copy_dst_cleanup(
                                vm_page_t       page);

/* Only compiled in when VM_FAULT_CLASSIFY is non-zero (it is 0 in this file). */
#if     VM_FAULT_CLASSIFY
extern void vm_fault_classify(vm_object_t       object,
                          vm_object_offset_t    offset,
                          vm_prot_t             fault_type);

extern void vm_fault_classify_init(void);
#endif

/*
 * Event counters.
 * NOTE(review): their update sites are outside this part of the file;
 * the names suggest pmap-enter blocking and code-signing validation
 * statistics -- confirm against the code that increments them.
 */
unsigned long vm_pmap_enter_blocked = 0;

unsigned long vm_cs_validates = 0;
unsigned long vm_cs_revalidates = 0;
unsigned long vm_cs_query_modified = 0;
unsigned long vm_cs_validated_dirtied = 0;
/*
 * With CONFIG_ENFORCE_SIGNED_CODE the flag is a writable int starting at 0
 * that vm_fault_init() may override from the "cs_enforcement_disable"
 * boot-arg; otherwise it is a compile-time constant 1.
 */
#if CONFIG_ENFORCE_SIGNED_CODE
int cs_enforcement_disable=0;
#else
static const int cs_enforcement_disable=1;
#endif
 188
 189 /*
 190  *      Routine:        vm_fault_init
 191  *      Purpose:
 192  *              Initialize our private data structures.
 193  */
 194 void
 195 vm_fault_init(void)
 196 {
 197 #if !SECURE_KERNEL
 198 #if CONFIG_ENFORCE_SIGNED_CODE
 199         PE_parse_boot_argn("cs_enforcement_disable", &cs_enforcement_disable,
 200                            sizeof (cs_enforcement_disable));
 201 #endif
 202         PE_parse_boot_argn("cs_debug", &cs_debug, sizeof (cs_debug));
 203 #endif
 204
 205         /*
 206          * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
 207          * computed as a percentage of available memory, and the percentage used is scaled inversely with
 208          * the amount of memory.  The pertange runs between 10% and 35%.  We use 35% for small memory systems
 209          * and reduce the value down to 10% for very large memory configurations.  This helps give us a
 210          * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
 211          * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
 212          */
 213
 214         vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100;
 215 }
 216
 217 /*
 218  *      Routine:        vm_fault_cleanup
 219  *      Purpose:
 220  *              Clean up the result of vm_fault_page.
 221  *      Results:
 222  *              The paging reference for "object" is released.
 223  *              "object" is unlocked.
 224  *              If "top_page" is not null,  "top_page" is
 225  *              freed and the paging reference for the object
 226  *              containing it is released.
 227  *
 228  *      In/out conditions:
 229  *              "object" must be locked.
 230  */
 231 void
 232 vm_fault_cleanup(
 233         register vm_object_t    object,
 234         register vm_page_t      top_page)
 235 {
 236         vm_object_paging_end(object);
 237         vm_object_unlock(object);
 238
 239         if (top_page != VM_PAGE_NULL) {
 240                 object = top_page->object;
 241
 242                 vm_object_lock(object);
 243                 VM_PAGE_FREE(top_page);
 244                 vm_object_paging_end(object);
 245                 vm_object_unlock(object);
 246         }
 247 }
 248
/*
 * Optional cluster statistics.  With MACH_CLUSTER_STATS enabled,
 * CLUSTER_STAT(clause) expands to its argument and the helper macros
 * bump one field of cluster_stats_in[x].  With it disabled,
 * CLUSTER_STAT(clause) expands to nothing and the helper macros are not
 * defined at all, so they may only be used inside a CLUSTER_STAT().
 */
#if     MACH_CLUSTER_STATS
#define MAXCLUSTERPAGES 16
/* Counters indexed by the "x" argument of the macros below; x must be below MAXCLUSTERPAGES. */
struct {
        unsigned long pages_in_cluster;
        unsigned long pages_at_higher_offsets;
        unsigned long pages_at_lower_offsets;
} cluster_stats_in[MAXCLUSTERPAGES];
#define CLUSTER_STAT(clause)    clause
#define CLUSTER_STAT_HIGHER(x)  \
        ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
#define CLUSTER_STAT_LOWER(x)   \
         ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
#define CLUSTER_STAT_CLUSTER(x) \
        ((cluster_stats_in[(x)].pages_in_cluster)++)
#else   /* MACH_CLUSTER_STATS */
#define CLUSTER_STAT(clause)
#endif  /* MACH_CLUSTER_STATS */
 266
/* True when x is a multiple of PAGE_SIZE_64 (low bits all clear). */
#define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)


/* Master switch: vm_fault_deactivate_behind() does nothing when this is FALSE. */
boolean_t       vm_page_deactivate_behind = TRUE;
/*
 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 *
 * WINDOW is a count of pages: vm_fault_deactivate_behind() multiplies
 * vm_default_behind by PAGE_SIZE_64 to get the distance behind the
 * faulting offset.  CLUSTER is the number of pages examined per pass.
 */
#define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW     128
#define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER    16              /* don't make this too big... */
                                                                /* we use it to size an array on the stack */

int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;

/*
 * Bound on the magnitude of object->sequential; that field moves in
 * PAGE_SIZE steps in vm_fault_is_sequential(), so this is a byte count.
 */
#define MAX_SEQUENTIAL_RUN      (1024 * 1024 * 1024)
 281
 282 /*
 283  * vm_page_is_sequential
 284  *
 285  * Determine if sequential access is in progress
 286  * in accordance with the behavior specified.
 287  * Update state to indicate current access pattern.
 288  *
 289  * object must have at least the shared lock held
 290  */
 291 static
 292 void
 293 vm_fault_is_sequential(
 294         vm_object_t             object,
 295         vm_object_offset_t      offset,
 296         vm_behavior_t           behavior)
 297 {
 298         vm_object_offset_t      last_alloc;
 299         int                     sequential;
 300         int                     orig_sequential;
 301
 302         last_alloc = object->last_alloc;
 303         sequential = object->sequential;
 304         orig_sequential = sequential;
 305
 306         switch (behavior) {
 307         case VM_BEHAVIOR_RANDOM:
 308                 /*
 309                  * reset indicator of sequential behavior
 310                  */
 311                 sequential = 0;
 312                 break;
 313
 314         case VM_BEHAVIOR_SEQUENTIAL:
 315                 if (offset && last_alloc == offset - PAGE_SIZE_64) {
 316                         /*
 317                          * advance indicator of sequential behavior
 318                          */
 319                         if (sequential < MAX_SEQUENTIAL_RUN)
 320                                 sequential += PAGE_SIZE;
 321                 } else {
 322                         /*
 323                          * reset indicator of sequential behavior
 324                          */
 325                         sequential = 0;
 326                 }
 327                 break;
 328
 329         case VM_BEHAVIOR_RSEQNTL:
 330                 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
 331                         /*
 332                          * advance indicator of sequential behavior
 333                          */
 334                         if (sequential > -MAX_SEQUENTIAL_RUN)
 335                                 sequential -= PAGE_SIZE;
 336                 } else {
 337                         /*
 338                          * reset indicator of sequential behavior
 339                          */
 340                         sequential = 0;
 341                 }
 342                 break;
 343
 344         case VM_BEHAVIOR_DEFAULT:
 345         default:
 346                 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
 347                         /*
 348                          * advance indicator of sequential behavior
 349                          */
 350                         if (sequential < 0)
 351                                 sequential = 0;
 352                         if (sequential < MAX_SEQUENTIAL_RUN)
 353                                 sequential += PAGE_SIZE;
 354
 355                 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
 356                         /*
 357                          * advance indicator of sequential behavior
 358                          */
 359                         if (sequential > 0)
 360                                 sequential = 0;
 361                         if (sequential > -MAX_SEQUENTIAL_RUN)
 362                                 sequential -= PAGE_SIZE;
 363                 } else {
 364                         /*
 365                          * reset indicator of sequential behavior
 366                          */
 367                         sequential = 0;
 368                 }
 369                 break;
 370         }
 371         if (sequential != orig_sequential) {
 372                 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
 373                         /*
 374                          * if someone else has already updated object->sequential
 375                          * don't bother trying to update it or object->last_alloc
 376                          */
 377                         return;
 378                 }
 379         }
 380         /*
 381          * I'd like to do this with a OSCompareAndSwap64, but that
 382          * doesn't exist for PPC...  however, it shouldn't matter
 383          * that much... last_alloc is maintained so that we can determine
 384          * if a sequential access pattern is taking place... if only
 385          * one thread is banging on this object, no problem with the unprotected
 386          * update... if 2 or more threads are banging away, we run the risk of
 387          * someone seeing a mangled update... however, in the face of multiple
 388          * accesses, no sequential access pattern can develop anyway, so we
 389          * haven't lost any real info.
 390          */
 391         object->last_alloc = offset;
 392 }
 393
 394
 395 int vm_page_deactivate_behind_count = 0;
 396
 397 /*
 398  * vm_page_deactivate_behind
 399  *
 400  * Determine if sequential access is in progress
 401  * in accordance with the behavior specified.  If
 402  * so, compute a potential page to deactivate and
 403  * deactivate it.
 404  *
 405  * object must be locked.
 406  *
 407  * return TRUE if we actually deactivate a page
 408  */
 409 static
 410 boolean_t
 411 vm_fault_deactivate_behind(
 412         vm_object_t             object,
 413         vm_object_offset_t      offset,
 414         vm_behavior_t           behavior)
 415 {
 416         int             n;
 417         int             pages_in_run = 0;
 418         int             max_pages_in_run = 0;
 419         int             sequential_run;
 420         int             sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 421         vm_object_offset_t      run_offset = 0;
 422         vm_object_offset_t      pg_offset = 0;
 423         vm_page_t       m;
 424         vm_page_t       page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
 425
 426         pages_in_run = 0;
 427 #if TRACEFAULTPAGE
 428         dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
 429 #endif
 430
 431         if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
 432                 /*
 433                  * Do not deactivate pages from the kernel object: they
 434                  * are not intended to become pageable.
 435                  * or we've disabled the deactivate behind mechanism
 436                  */
 437                 return FALSE;
 438         }
 439         if ((sequential_run = object->sequential)) {
 440                   if (sequential_run < 0) {
 441                           sequential_behavior = VM_BEHAVIOR_RSEQNTL;
 442                           sequential_run = 0 - sequential_run;
 443                   } else {
 444                           sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 445                   }
 446         }
 447         switch (behavior) {
 448         case VM_BEHAVIOR_RANDOM:
 449                 break;
 450         case VM_BEHAVIOR_SEQUENTIAL:
 451                 if (sequential_run >= (int)PAGE_SIZE) {
 452                         run_offset = 0 - PAGE_SIZE_64;
 453                         max_pages_in_run = 1;
 454                 }
 455                 break;
 456         case VM_BEHAVIOR_RSEQNTL:
 457                 if (sequential_run >= (int)PAGE_SIZE) {
 458                         run_offset = PAGE_SIZE_64;
 459                         max_pages_in_run = 1;
 460                 }
 461                 break;
 462         case VM_BEHAVIOR_DEFAULT:
 463         default:
 464         {       vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
 465
 466                 /*
 467                  * determine if the run of sequential accesss has been
 468                  * long enough on an object with default access behavior
 469                  * to consider it for deactivation
 470                  */
 471                 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
 472                         /*
 473                          * the comparisons between offset and behind are done
 474                          * in this kind of odd fashion in order to prevent wrap around
 475                          * at the end points
 476                          */
 477                         if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
 478                                 if (offset >= behind) {
 479                                         run_offset = 0 - behind;
 480                                         pg_offset = PAGE_SIZE_64;
 481                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 482                                 }
 483                         } else {
 484                                 if (offset < -behind) {
 485                                         run_offset = behind;
 486                                         pg_offset = 0 - PAGE_SIZE_64;
 487                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 488                                 }
 489                         }
 490                 }
 491                 break;
 492         }
 493         }
 494         for (n = 0; n < max_pages_in_run; n++) {
 495                 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
 496
 497                 if (m && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
 498                         page_run[pages_in_run++] = m;
 499                         pmap_clear_reference(m->phys_page);
 500                 }
 501         }
 502         if (pages_in_run) {
 503                 vm_page_lockspin_queues();
 504
 505                 for (n = 0; n < pages_in_run; n++) {
 506
 507                         m = page_run[n];
 508
 509                         vm_page_deactivate_internal(m, FALSE);
 510
 511                         vm_page_deactivate_behind_count++;
 512 #if TRACEFAULTPAGE
 513                         dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
 514 #endif
 515                 }
 516                 vm_page_unlock_queues();
 517
 518                 return TRUE;
 519         }
 520         return FALSE;
 521 }
 522
 523
 524 static boolean_t
 525 vm_page_throttled(void)
 526 {
 527         clock_sec_t     elapsed_sec;
 528         clock_sec_t     tv_sec;
 529         clock_usec_t    tv_usec;
 530
 531         thread_t thread = current_thread();
 532
 533         if (thread->options & TH_OPT_VMPRIV)
 534                 return (FALSE);
 535
 536         thread->t_page_creation_count++;
 537
 538         if (NEED_TO_HARD_THROTTLE_THIS_TASK())
 539                 return (TRUE);
 540
 541         if (vm_page_free_count < vm_page_throttle_limit &&
 542             thread->t_page_creation_count > vm_page_creation_throttle) {
 543
 544                 clock_get_system_microtime(&tv_sec, &tv_usec);
 545
 546                 elapsed_sec = tv_sec - thread->t_page_creation_time;
 547
 548                 if (elapsed_sec <= 6 || (thread->t_page_creation_count / elapsed_sec) >= (vm_page_creation_throttle / 6)) {
 549
 550                         if (elapsed_sec >= 60) {
 551                                 /*
 552                                  * we'll reset our stats to give a well behaved app
 553                                  * that was unlucky enough to accumulate a bunch of pages
 554                                  * over a long period of time a chance to get out of
 555                                  * the throttled state... we reset the counter and timestamp
 556                                  * so that if it stays under the rate limit for the next second
 557                                  * it will be back in our good graces... if it exceeds it, it
 558                                  * will remain in the throttled state
 559                                  */
 560                                 thread->t_page_creation_time = tv_sec;
 561                                 thread->t_page_creation_count = (vm_page_creation_throttle / 6) * 5;
 562                         }
 563                         ++vm_page_throttle_count;
 564
 565                         return (TRUE);
 566                 }
 567                 thread->t_page_creation_time = tv_sec;
 568                 thread->t_page_creation_count = 0;
 569         }
 570         return (FALSE);
 571 }
 572
 573
 574 /*
 575  * check for various conditions that would
 576  * prevent us from creating a ZF page...
 577  * cleanup is based on being called from vm_fault_page
 578  *
 579  * object must be locked
 580  * object == m->object
 581  */
 582 static vm_fault_return_t
 583 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
 584 {
 585         if (object->shadow_severed ||
 586             VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
 587                 /*
 588                  * Either:
 589                  * 1. the shadow chain was severed,
 590                  * 2. the purgeable object is volatile or empty and is marked
 591                  *    to fault on access while volatile.
 592                  * Just have to return an error at this point
 593                  */
 594                 if (m != VM_PAGE_NULL)
 595                         VM_PAGE_FREE(m);
 596                 vm_fault_cleanup(object, first_m);
 597
 598                 thread_interrupt_level(interruptible_state);
 599
 600                 return (VM_FAULT_MEMORY_ERROR);
 601         }
 602         if (vm_backing_store_low) {
 603                 /*
 604                  * are we protecting the system from
 605                  * backing store exhaustion.  If so
 606                  * sleep unless we are privileged.
 607                  */
 608                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
 609
 610                         if (m != VM_PAGE_NULL)
 611                                 VM_PAGE_FREE(m);
 612                         vm_fault_cleanup(object, first_m);
 613
 614                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
 615
 616                         thread_block(THREAD_CONTINUE_NULL);
 617                         thread_interrupt_level(interruptible_state);
 618
 619                         return (VM_FAULT_RETRY);
 620                 }
 621         }
 622         if (vm_page_throttled()) {
 623                 /*
 624                  * we're throttling zero-fills...
 625                  * treat this as if we couldn't grab a page
 626                  */
 627                 if (m != VM_PAGE_NULL)
 628                         VM_PAGE_FREE(m);
 629                 vm_fault_cleanup(object, first_m);
 630
 631                 if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
 632                         delay(HARD_THROTTLE_DELAY);
 633
 634                         if (current_thread_aborted()) {
 635                                 thread_interrupt_level(interruptible_state);
 636                                 return VM_FAULT_INTERRUPTED;
 637                         }
 638                 }
 639
 640                 thread_interrupt_level(interruptible_state);
 641
 642                 return (VM_FAULT_MEMORY_SHORTAGE);
 643         }
 644         return (VM_FAULT_SUCCESS);
 645 }
 646
 647
 648 /*
 649  * do the work to zero fill a page and
 650  * inject it into the correct paging queue
 651  *
 652  * m->object must be locked
 653  * page queue lock must NOT be held
 654  */
 655 static int
 656 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
 657 {
 658         int my_fault = DBG_ZERO_FILL_FAULT;
 659
 660         /*
 661          * This is is a zero-fill page fault...
 662          *
 663          * Checking the page lock is a waste of
 664          * time;  this page was absent, so
 665          * it can't be page locked by a pager.
 666          *
 667          * we also consider it undefined
 668          * with respect to instruction
 669          * execution.  i.e. it is the responsibility
 670          * of higher layers to call for an instruction
 671          * sync after changing the contents and before
 672          * sending a program into this area.  We
 673          * choose this approach for performance
 674          */
 675         m->pmapped = TRUE;
 676
 677         m->cs_validated = FALSE;
 678         m->cs_tainted = FALSE;
 679
 680         if (no_zero_fill == TRUE)
 681                 my_fault = DBG_NZF_PAGE_FAULT;
 682         else {
 683                 vm_page_zero_fill(m);
 684
 685                 VM_STAT_INCR(zero_fill_count);
 686                 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
 687         }
 688         assert(!m->laundry);
 689         assert(m->object != kernel_object);
 690         //assert(m->pageq.next == NULL && m->pageq.prev == NULL);
 691
 692         if (!IP_VALID(memory_manager_default) &&
 693                 (m->object->purgable == VM_PURGABLE_DENY ||
 694                  m->object->purgable == VM_PURGABLE_NONVOLATILE ||
 695                  m->object->purgable == VM_PURGABLE_VOLATILE )) {
 696                 vm_page_lockspin_queues();
 697
 698                 queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
 699                 m->throttled = TRUE;
 700                 vm_page_throttled_count++;
 701
 702                 vm_page_unlock_queues();
 703         } else {
 704                 if (current_thread()->t_page_creation_count > vm_page_creation_throttle) {
 705                         m->zero_fill = TRUE;
 706                         VM_ZF_COUNT_INCR();
 707                 }
 708         }
 709         return (my_fault);
 710 }
 711
 712
 713 /*
 714  *      Routine:        vm_fault_page
 715  *      Purpose:
 716  *              Find the resident page for the virtual memory
 717  *              specified by the given virtual memory object
 718  *              and offset.
 719  *      Additional arguments:
 720  *              The required permissions for the page is given
 721  *              in "fault_type".  Desired permissions are included
 722  *              in "protection".
 723  *              fault_info is passed along to determine pagein cluster
 724  *              limits... it contains the expected reference pattern,
 725  *              cluster size if available, etc...
 726  *
 727  *              If the desired page is known to be resident (for
 728  *              example, because it was previously wired down), asserting
 729  *              the "unwiring" parameter will speed the search.
 730  *
 731  *              If the operation can be interrupted (by thread_abort
 732  *              or thread_terminate), then the "interruptible"
 733  *              parameter should be asserted.
 734  *
 735  *      Results:
 736  *              The page containing the proper data is returned
 737  *              in "result_page".
 738  *
 739  *      In/out conditions:
 740  *              The source object must be locked and referenced,
 741  *              and must donate one paging reference.  The reference
 742  *              is not affected.  The paging reference and lock are
 743  *              consumed.
 744  *
 745  *              If the call succeeds, the object in which "result_page"
 746  *              resides is left locked and holding a paging reference.
 747  *              If this is not the original object, a busy page in the
 748  *              original object is returned in "top_page", to prevent other
 749  *              callers from pursuing this same data, along with a paging
 750  *              reference for the original object.  The "top_page" should
 751  *              be destroyed when this guarantee is no longer required.
 752  *              The "result_page" is also left busy.  It is not removed
 753  *              from the pageout queues.
 754  *      Special Case:
 755  *              A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
 756  *              fault succeeded but there's no VM page (i.e. the VM object
 757  *              does not actually hold VM pages, but device memory or
 758  *              large pages).  The object is still locked and we still hold a
 759  *              paging_in_progress reference.
 760  */
 761 unsigned int vm_fault_page_blocked_access = 0;
 762
 763 vm_fault_return_t
 764 vm_fault_page(
 765         /* Arguments: */
 766         vm_object_t     first_object,   /* Object to begin search */
 767         vm_object_offset_t first_offset,        /* Offset into object */
 768         vm_prot_t       fault_type,     /* What access is requested */
 769         boolean_t       must_be_resident,/* Must page be resident? */
 770         /* Modifies in place: */
 771         vm_prot_t       *protection,    /* Protection for mapping */
 772         /* Returns: */
 773         vm_page_t       *result_page,   /* Page found, if successful */
 774         vm_page_t       *top_page,      /* Page in top object, if
 775                                          * not result_page.  */
 776         int             *type_of_fault, /* if non-null, fill in with type of fault
 777                                          * COW, zero-fill, etc... returned in trace point */
 778         /* More arguments: */
 779         kern_return_t   *error_code,    /* code if page is in error */
 780         boolean_t       no_zero_fill,   /* don't zero fill absent pages */
 781 #if MACH_PAGEMAP
 782         boolean_t       data_supply,    /* treat as data_supply if
 783                                          * it is a write fault and a full
 784                                          * page is provided */
 785 #else
 786         __unused boolean_t data_supply,
 787 #endif
 788         vm_object_fault_info_t fault_info)
 789 {
 790         vm_page_t               m;
 791         vm_object_t             object;
 792         vm_object_offset_t      offset;
 793         vm_page_t               first_m;
 794         vm_object_t             next_object;
 795         vm_object_t             copy_object;
 796         boolean_t               look_for_page;
 797         vm_prot_t               access_required = fault_type;
 798         vm_prot_t               wants_copy_flag;
 799         CLUSTER_STAT(int pages_at_higher_offsets;)
 800         CLUSTER_STAT(int pages_at_lower_offsets;)
 801         kern_return_t           wait_result;
 802         boolean_t               interruptible_state;
 803         vm_fault_return_t       error;
 804         int                     my_fault;
 805         uint32_t                try_failed_count;
 806         int                     interruptible; /* how may fault be interrupted? */
 807         memory_object_t         pager;
 808         vm_fault_return_t       retval;
 809
 810 /*
 811  * MACH page map - an optional optimization where a bit map is maintained
 812  * by the VM subsystem for internal objects to indicate which pages of
 813  * the object currently reside on backing store.  This existence map
 814  * duplicates information maintained by the vnode pager.  It is
 815  * created at the time of the first pageout against the object, i.e.
 816  * at the same time pager for the object is created.  The optimization
 817  * is designed to eliminate pager interaction overhead, if it is
 818  * 'known' that the page does not exist on backing store.
 819  *
 820  * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
 821  * either marked as paged out in the existence map for the object or no
 822  * existence map exists for the object.  MUST_ASK_PAGER() is one of the
 823  * criteria in the decision to invoke the pager.   It is also used as one
 824  * of the criteria to terminate the scan for adjacent pages in a clustered
 825  * pagein operation.  Note that MUST_ASK_PAGER() always evaluates to TRUE for
 826  * permanent objects.  Note also that if the pager for an internal object
 827  * has not been created, the pager is not invoked regardless of the value
 828  * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
 829  * for which a pager has been created.
 830  *
 831  * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 832  * is marked as paged out in the existence map for the object.
 833  * PAGED_OUT() is used to determine if a page has already been pushed
 834  * into a copy object in order to avoid a redundant page out operation.
 835  */
 836 #if MACH_PAGEMAP
 837 #define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \
 838                         != VM_EXTERNAL_STATE_ABSENT)
 839 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
 840                         == VM_EXTERNAL_STATE_EXISTS)
 841 #else   /* !MACH_PAGEMAP: both macros collapse to constants */
 842 #define MUST_ASK_PAGER(o, f) (TRUE)
 843 #define PAGED_OUT(o, f) (FALSE)
 844 #endif  /* MACH_PAGEMAP */
 845
 846 /*      Recovery actions.  RELEASE_PAGE(m) calls PAGE_WAKEUP_DONE() on "m", then
 847  *      calls vm_page_activate() if "m" is not marked active, inactive or throttled;
 848  *      the flags are re-tested once the page queues lock is held.  */
 849 #define RELEASE_PAGE(m)                                 \
 850         MACRO_BEGIN                                     \
 851         PAGE_WAKEUP_DONE(m);                            \
 852         if (!(m)->active && !(m)->inactive && !(m)->throttled) {        \
 853                 vm_page_lockspin_queues();                              \
 854                 if (!(m)->active && !(m)->inactive && !(m)->throttled)  \
 855                         vm_page_activate(m);                            \
 856                 vm_page_unlock_queues();                                \
 857         }                                                               \
 858         MACRO_END
 859
 860 #if TRACEFAULTPAGE
 861         dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
 862 #endif
 863
 864
 865 #if     MACH_KDB
 866                 /*
 867                  *      If there are watchpoints set, then
 868                  *      we don't want to give away write permission
 869                  *      on a read fault.  Make the task write fault,
 870                  *      so that the watchpoint code notices the access.
 871                  */
 872             if (db_watchpoint_list) {
 873                 /*
 874                  *      If we aren't asking for write permission,
 875                  *      then don't give it away.  We're using write
 876                  *      faults to set the dirty bit.
 877                  */
 878                 if (!(fault_type & VM_PROT_WRITE))
 879                         *protection &= ~VM_PROT_WRITE;
 880         }
 881 #endif  /* MACH_KDB */
 882
 883         interruptible = fault_info->interruptible;
 884         interruptible_state = thread_interrupt_level(interruptible);
 885
 886         /*
 887          *      INVARIANTS (through entire routine):
 888          *
 889          *      1)      At all times, we must either have the object
 890          *              lock or a busy page in some object to prevent
 891          *              some other thread from trying to bring in
 892          *              the same page.
 893          *
 894          *              Note that we cannot hold any locks during the
 895          *              pager access or when waiting for memory, so
 896          *              we use a busy page then.
 897          *
 898          *      2)      To prevent another thread from racing us down the
 899          *              shadow chain and entering a new page in the top
 900          *              object before we do, we must keep a busy page in
 901          *              the top object while following the shadow chain.
 902          *
 903          *      3)      We must increment paging_in_progress on any object
 904          *              for which we have a busy page before dropping
 905          *              the object lock
 906          *
 907          *      4)      We leave busy pages on the pageout queues.
 908          *              If the pageout daemon comes across a busy page,
 909          *              it will remove the page from the pageout queues.
 910          */
 911
 912         object = first_object;
 913         offset = first_offset;
 914         first_m = VM_PAGE_NULL;
 915         access_required = fault_type;
 916
 917
 918         XPR(XPR_VM_FAULT,
 919                 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
 920                 object, offset, fault_type, *protection, 0);
 921
 922         /*
 923          * default type of fault
 924          */
 925         my_fault = DBG_CACHE_HIT_FAULT;
 926
 927         while (TRUE) {
 928 #if TRACEFAULTPAGE
 929                 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
 930 #endif
 931                 if (!object->alive) {
 932                         /*
 933                          * object is no longer valid
 934                          * clean up and return error
 935                          */
 936                         vm_fault_cleanup(object, first_m);
 937                         thread_interrupt_level(interruptible_state);
 938
 939                         return (VM_FAULT_MEMORY_ERROR);
 940                 }
 941
 942                 if (!object->pager_created && object->phys_contiguous) {
 943                         /*
 944                          * A physically-contiguous object without a pager:
 945                          * must be a "large page" object.  We do not deal
 946                          * with VM pages for this object.
 947                          */
 948                         m = VM_PAGE_NULL;
 949                         goto phys_contig_object;
 950                 }
 951
 952                 if (object->blocked_access) {
 953                         /*
 954                          * Access to this VM object has been blocked.
 955                          * Replace our "paging_in_progress" reference with
 956                          * a "activity_in_progress" reference and wait for
 957                          * access to be unblocked.
 958                          */
 959                         vm_object_activity_begin(object);
 960                         vm_object_paging_end(object);
 961                         while (object->blocked_access) {
 962                                 vm_object_sleep(object,
 963                                                 VM_OBJECT_EVENT_UNBLOCKED,
 964                                                 THREAD_UNINT);
 965                         }
 966                         vm_fault_page_blocked_access++;
 967                         vm_object_paging_begin(object);
 968                         vm_object_activity_end(object);
 969                 }
 970
 971                 /*
 972                  * See whether the page at 'offset' is resident
 973                  */
 974                 m = vm_page_lookup(object, offset);
 975 #if TRACEFAULTPAGE
 976                 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
 977 #endif
 978                 if (m != VM_PAGE_NULL) {
 979
 980                         if (m->busy) {
 981                                 /*
 982                                  * The page is being brought in,
 983                                  * wait for it and then retry.
 984                                  *
 985                                  * A possible optimization: if the page
 986                                  * is known to be resident, we can ignore
 987                                  * pages that are absent (regardless of
 988                                  * whether they're busy).
 989                                  */
 990 #if TRACEFAULTPAGE
 991                                 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
 992 #endif
 993                                 wait_result = PAGE_SLEEP(object, m, interruptible);
 994                                 XPR(XPR_VM_FAULT,
 995                                     "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
 996                                         object, offset,
 997                                         m, 0, 0);
 998                                 counter(c_vm_fault_page_block_busy_kernel++);
 999
1000                                 if (wait_result != THREAD_AWAKENED) {
1001                                         vm_fault_cleanup(object, first_m);
1002                                         thread_interrupt_level(interruptible_state);
1003
1004                                         if (wait_result == THREAD_RESTART)
1005                                                 return (VM_FAULT_RETRY);
1006                                         else
1007                                                 return (VM_FAULT_INTERRUPTED);
1008                                 }
1009                                 continue;
1010                         }
1011
1012                         if (m->phys_page == vm_page_guard_addr) {
1013                                 /*
1014                                  * Guard page: off limits !
1015                                  */
1016                                 if (fault_type == VM_PROT_NONE) {
1017                                         /*
1018                                          * The fault is not requesting any
1019                                          * access to the guard page, so it must
1020                                          * be just to wire or unwire it.
1021                                          * Let's pretend it succeeded...
1022                                          */
1023                                         m->busy = TRUE;
1024                                         *result_page = m;
1025                                         assert(first_m == VM_PAGE_NULL);
1026                                         *top_page = first_m;
1027                                         if (type_of_fault)
1028                                                 *type_of_fault = DBG_GUARD_FAULT;
1029                                         return VM_FAULT_SUCCESS;
1030                                 } else {
1031                                         /*
1032                                          * The fault requests access to the
1033                                          * guard page: let's deny that !
1034                                          */
1035                                         vm_fault_cleanup(object, first_m);
1036                                         thread_interrupt_level(interruptible_state);
1037                                         return VM_FAULT_MEMORY_ERROR;
1038                                 }
1039                         }
1040
1041                         if (m->error) {
1042                                 /*
1043                                  * The page is in error, give up now.
1044                                  */
1045 #if TRACEFAULTPAGE
1046                                 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);      /* (TEST/DEBUG) */
1047 #endif
1048                                 if (error_code)
1049                                         *error_code = KERN_MEMORY_ERROR;
1050                                 VM_PAGE_FREE(m);
1051
1052                                 vm_fault_cleanup(object, first_m);
1053                                 thread_interrupt_level(interruptible_state);
1054
1055                                 return (VM_FAULT_MEMORY_ERROR);
1056                         }
1057                         if (m->restart) {
1058                                 /*
1059                                  * The pager wants us to restart
1060                                  * at the top of the chain,
1061                                  * typically because it has moved the
1062                                  * page to another pager, then do so.
1063                                  */
1064 #if TRACEFAULTPAGE
1065                                 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1066 #endif
1067                                 VM_PAGE_FREE(m);
1068
1069                                 vm_fault_cleanup(object, first_m);
1070                                 thread_interrupt_level(interruptible_state);
1071
1072                                 return (VM_FAULT_RETRY);
1073                         }
1074                         if (m->absent) {
1075                                 /*
1076                                  * The page isn't busy, but is absent,
1077                                  * therefore it's deemed "unavailable".
1078                                  *
1079                                  * Remove the non-existent page (unless it's
1080                                  * in the top object) and move on down to the
1081                                  * next object (if there is one).
1082                                  */
1083 #if TRACEFAULTPAGE
1084                                 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);  /* (TEST/DEBUG) */
1085 #endif
1086                                 next_object = object->shadow;
1087
1088                                 if (next_object == VM_OBJECT_NULL) {
1089                                         /*
1090                                          * Absent page at bottom of shadow
1091                                          * chain; zero fill the page we left
1092                                          * busy in the first object, and free
1093                                          * the absent page.
1094                                          */
1095                                         assert(!must_be_resident);
1096
1097                                         /*
1098                                          * check for any conditions that prevent
1099                                          * us from creating a new zero-fill page
1100                                          * vm_fault_check will do all of the
1101                                          * fault cleanup in the case of an error condition
1102                                          * including resetting the thread_interrupt_level
1103                                          */
1104                                         error = vm_fault_check(object, m, first_m, interruptible_state);
1105
1106                                         if (error != VM_FAULT_SUCCESS)
1107                                                 return (error);
1108
1109                                         XPR(XPR_VM_FAULT,
1110                                             "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
1111                                                 object, offset,
1112                                                 m,
1113                                                 first_object, 0);
1114
1115                                         if (object != first_object) {
1116                                                 /*
1117                                                  * free the absent page we just found
1118                                                  */
1119                                                 VM_PAGE_FREE(m);
1120
1121                                                 /*
1122                                                  * drop reference and lock on current object
1123                                                  */
1124                                                 vm_object_paging_end(object);
1125                                                 vm_object_unlock(object);
1126
1127                                                 /*
1128                                                  * grab the original page we
1129                                                  * 'soldered' in place and
1130                                                  * retake lock on 'first_object'
1131                                                  */
1132                                                 m = first_m;
1133                                                 first_m = VM_PAGE_NULL;
1134
1135                                                 object = first_object;
1136                                                 offset = first_offset;
1137
1138                                                 vm_object_lock(object);
1139                                         } else {
1140                                                 /*
1141                                                  * we're going to use the absent page we just found
1142                                                  * so convert it to a 'busy' page
1143                                                  */
1144                                                 m->absent = FALSE;
1145                                                 m->busy = TRUE;
1146                                         }
1147                                         /*
1148                                          * zero-fill the page and put it on
1149                                          * the correct paging queue
1150                                          */
1151                                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1152
1153                                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1154                                                 m->absent = TRUE;
1155                                         break;
1156                                 } else {
1157                                         if (must_be_resident)
1158                                                 vm_object_paging_end(object);
1159                                         else if (object != first_object) {
1160                                                 vm_object_paging_end(object);
1161                                                 VM_PAGE_FREE(m);
1162                                         } else {
1163                                                 first_m = m;
1164                                                 m->absent = FALSE;
1165                                                 m->busy = TRUE;
1166
1167                                                 vm_page_lockspin_queues();
1168                                                 VM_PAGE_QUEUES_REMOVE(m);
1169                                                 vm_page_unlock_queues();
1170                                         }
1171                                         XPR(XPR_VM_FAULT,
1172                                             "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1173                                                 object, offset,
1174                                                 next_object,
1175                                                 offset+object->shadow_offset,0);
1176
1177                                         offset += object->shadow_offset;
1178                                         fault_info->lo_offset += object->shadow_offset;
1179                                         fault_info->hi_offset += object->shadow_offset;
1180                                         access_required = VM_PROT_READ;
1181
1182                                         vm_object_lock(next_object);
1183                                         vm_object_unlock(object);
1184                                         object = next_object;
1185                                         vm_object_paging_begin(object);
1186
1187                                         /*
1188                                          * reset to default type of fault
1189                                          */
1190                                         my_fault = DBG_CACHE_HIT_FAULT;
1191
1192                                         continue;
1193                                 }
1194                         }
1195                         if ((m->cleaning)
1196                             && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1197                             && (fault_type & VM_PROT_WRITE)) {
1198                                 /*
1199                                  * This is a copy-on-write fault that will
1200                                  * cause us to revoke access to this page, but
1201                                  * this page is in the process of being cleaned
1202                                  * in a clustered pageout. We must wait until
1203                                  * the cleaning operation completes before
1204                                  * revoking access to the original page,
1205                                  * otherwise we might attempt to remove a
1206                                  * wired mapping.
1207                                  */
1208 #if TRACEFAULTPAGE
1209                                 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
1210 #endif
1211                                 XPR(XPR_VM_FAULT,
1212                                     "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1213                                         object, offset,
1214                                         m, 0, 0);
1215                                 /*
1216                                  * take an extra ref so that object won't die
1217                                  */
1218                                 vm_object_reference_locked(object);
1219
1220                                 vm_fault_cleanup(object, first_m);
1221
1222                                 counter(c_vm_fault_page_block_backoff_kernel++);
1223                                 vm_object_lock(object);
1224                                 assert(object->ref_count > 0);
1225
1226                                 m = vm_page_lookup(object, offset);
1227
1228                                 if (m != VM_PAGE_NULL && m->cleaning) {
1229                                         PAGE_ASSERT_WAIT(m, interruptible);
1230
1231                                         vm_object_unlock(object);
1232                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1233                                         vm_object_deallocate(object);
1234
1235                                         goto backoff;
1236                                 } else {
1237                                         vm_object_unlock(object);
1238
1239                                         vm_object_deallocate(object);
1240                                         thread_interrupt_level(interruptible_state);
1241
1242                                         return (VM_FAULT_RETRY);
1243                                 }
1244                         }
1245                         if (type_of_fault == NULL && m->speculative &&
1246                             !(fault_info != NULL && fault_info->stealth)) {
1247                                 /*
1248                                  * If we were passed a non-NULL pointer for
1249                                  * "type_of_fault", then we came from
1250                                  * vm_fault... we'll let it deal with
1251                                  * this condition, since it
1252                                  * needs to see m->speculative to correctly
1253                                  * account the pageins, otherwise...
1254                                  * take it off the speculative queue, we'll
1255                                  * let the caller of vm_fault_page deal
1256                                  * with getting it onto the correct queue
1257                                  *
1258                                  * If the caller specified in fault_info that
1259                                  * it wants a "stealth" fault, we also leave
1260                                  * the page in the speculative queue.
1261                                  */
1262                                 vm_page_lockspin_queues();
1263                                 VM_PAGE_QUEUES_REMOVE(m);
1264                                 vm_page_unlock_queues();
1265                         }
1266
1267                         if (m->encrypted) {
1268                                 /*
1269                                  * ENCRYPTED SWAP:
1270                                  * the user needs access to a page that we
1271                                  * encrypted before paging it out.
1272                                  * Decrypt the page now.
1273                                  * Keep it busy to prevent anyone from
1274                                  * accessing it during the decryption.
1275                                  */
1276                                 m->busy = TRUE;
1277                                 vm_page_decrypt(m, 0);
1278                                 assert(object == m->object);
1279                                 assert(m->busy);
1280                                 PAGE_WAKEUP_DONE(m);
1281
1282                                 /*
1283                                  * Retry from the top, in case
1284                                  * something changed while we were
1285                                  * decrypting.
1286                                  */
1287                                 continue;
1288                         }
1289                         ASSERT_PAGE_DECRYPTED(m);
1290
1291                         if (m->object->code_signed) {
1292                                 /*
1293                                  * CODE SIGNING:
1294                                  * We just paged in a page from a signed
1295                                  * memory object but we don't need to
1296                                  * validate it now.  We'll validate it if
1297                                  * validate it now.  We'll validate it if and
1298                                  * space for the first time or when the page
1299                                  * gets copied to another object as a result
1300                                  * of a copy-on-write.
1301                                  */
1302                         }
1303
1304                         /*
1305                          * We mark the page busy and leave it on
1306                          * the pageout queues.  If the pageout
1307                          * daemon comes across it, then it will
1308                          * remove the page from the queue, but not the object
1309                          */
1310 #if TRACEFAULTPAGE
1311                         dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1312 #endif
1313                         XPR(XPR_VM_FAULT,
1314                             "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1315                                 object, offset, m, 0, 0);
1316                         assert(!m->busy);
1317                         assert(!m->absent);
1318
1319                         m->busy = TRUE;
1320                         break;
1321                 }
1322
1323
1324                 /*
1325                  * we get here when there is no page present in the object at
1326                  * the offset we're interested in... we'll allocate a page
1327                  * at this point if the pager associated with
1328                  * this object can provide the data or we're the top object...
1329                  * object is locked;  m == NULL
1330                  */
1331                 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply);
1332
1333 #if TRACEFAULTPAGE
1334                 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);      /* (TEST/DEBUG) */
1335 #endif
1336                 if ((look_for_page || (object == first_object)) && !must_be_resident && !object->phys_contiguous) {
1337                         /*
1338                          * Allocate a new page for this object/offset pair
1339                          */
1340                         m = vm_page_grab();
1341 #if TRACEFAULTPAGE
1342                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1343 #endif
1344                         if (m == VM_PAGE_NULL) {
1345
1346                                 vm_fault_cleanup(object, first_m);
1347                                 thread_interrupt_level(interruptible_state);
1348
1349                                 return (VM_FAULT_MEMORY_SHORTAGE);
1350                         }
1351                         vm_page_insert(m, object, offset);
1352                 }
1353                 if (look_for_page && !must_be_resident) {
1354                         kern_return_t   rc;
1355
1356                         /*
1357                          *      If the memory manager is not ready, we
1358                          *      cannot make requests.
1359                          */
1360                         if (!object->pager_ready) {
1361 #if TRACEFAULTPAGE
1362                                 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1363 #endif
1364                                 if (m != VM_PAGE_NULL)
1365                                         VM_PAGE_FREE(m);
1366
1367                                 XPR(XPR_VM_FAULT,
1368                                 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1369                                         object, offset, 0, 0, 0);
1370
1371                                 /*
1372                                  * take an extra ref so object won't die
1373                                  */
1374                                 vm_object_reference_locked(object);
1375                                 vm_fault_cleanup(object, first_m);
1376                                 counter(c_vm_fault_page_block_backoff_kernel++);
1377
1378                                 vm_object_lock(object);
1379                                 assert(object->ref_count > 0);
1380
1381                                 if (!object->pager_ready) {
1382                                         wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1383
1384                                         vm_object_unlock(object);
1385                                         if (wait_result == THREAD_WAITING)
1386                                                 wait_result = thread_block(THREAD_CONTINUE_NULL);
1387                                         vm_object_deallocate(object);
1388
1389                                         goto backoff;
1390                                 } else {
1391                                         vm_object_unlock(object);
1392                                         vm_object_deallocate(object);
1393                                         thread_interrupt_level(interruptible_state);
1394
1395                                         return (VM_FAULT_RETRY);
1396                                 }
1397                         }
1398                         if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1399                                 /*
1400                                  * If there are too many outstanding page
1401                                  * requests pending on this external object, we
1402                                  * wait for them to be resolved now.
1403                                  */
1404 #if TRACEFAULTPAGE
1405                                 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1406 #endif
1407                                 if (m != VM_PAGE_NULL)
1408                                         VM_PAGE_FREE(m);
1409                                 /*
1410                                  * take an extra ref so object won't die
1411                                  */
1412                                 vm_object_reference_locked(object);
1413
1414                                 vm_fault_cleanup(object, first_m);
1415
1416                                 counter(c_vm_fault_page_block_backoff_kernel++);
1417
1418                                 vm_object_lock(object);
1419                                 assert(object->ref_count > 0);
1420
1421                                 if (object->paging_in_progress > vm_object_pagein_throttle) {
1422                                         vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_IN_PROGRESS, interruptible);
1423
1424                                         vm_object_unlock(object);
1425                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1426                                         vm_object_deallocate(object);
1427
1428                                         goto backoff;
1429                                 } else {
1430                                         vm_object_unlock(object);
1431                                         vm_object_deallocate(object);
1432                                         thread_interrupt_level(interruptible_state);
1433
1434                                         return (VM_FAULT_RETRY);
1435                                 }
1436                         }
1437                         if (m != VM_PAGE_NULL) {
1438                                 /*
1439                                  * Indicate that the page is waiting for data
1440                                  * from the memory manager.
1441                                  */
1442                                 m->list_req_pending = TRUE;
1443                                 m->absent = TRUE;
1444                         }
1445
1446 #if TRACEFAULTPAGE
1447                         dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);  /* (TEST/DEBUG) */
1448 #endif
1449
1450                         /*
1451                          * It's possible someone called vm_object_destroy while we weren't
1452                          * holding the object lock.  If that has happened, then bail out
1453                          * here.
1454                          */
1455
1456                         pager = object->pager;
1457
1458                         if (pager == MEMORY_OBJECT_NULL) {
1459                                 vm_fault_cleanup(object, first_m);
1460                                 thread_interrupt_level(interruptible_state);
1461                                 return VM_FAULT_MEMORY_ERROR;
1462                         }
1463
1464                         /*
1465                          * We have an absent page in place for the faulting offset,
1466                          * so we can release the object lock.
1467                          */
1468
1469                         vm_object_unlock(object);
1470
1471                         /*
1472                          * If this object uses a copy_call strategy,
1473                          * and we are interested in a copy of this object
1474                          * (having gotten here only by following a
1475                          * shadow chain), then tell the memory manager
1476                          * via a flag added to the desired_access
1477                          * parameter, so that it can detect a race
1478                          * between our walking down the shadow chain
1479                          * and its pushing pages up into a copy of
1480                          * the object that it manages.
1481                          */
1482                         if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1483                                 wants_copy_flag = VM_PROT_WANTS_COPY;
1484                         else
1485                                 wants_copy_flag = VM_PROT_NONE;
1486
1487                         XPR(XPR_VM_FAULT,
1488                             "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1489                                 object, offset, m,
1490                                 access_required | wants_copy_flag, 0);
1491
1492                         /*
1493                          * Call the memory manager to retrieve the data.
1494                          */
1495                         rc = memory_object_data_request(
1496                                 pager,
1497                                 offset + object->paging_offset,
1498                                 PAGE_SIZE,
1499                                 access_required | wants_copy_flag,
1500                                 (memory_object_fault_info_t)fault_info);
1501
1502 #if TRACEFAULTPAGE
1503                         dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1504 #endif
1505                         vm_object_lock(object);
1506
1507                         if (rc != KERN_SUCCESS) {
1508
1509                                 vm_fault_cleanup(object, first_m);
1510                                 thread_interrupt_level(interruptible_state);
1511
1512                                 return ((rc == MACH_SEND_INTERRUPTED) ?
1513                                         VM_FAULT_INTERRUPTED :
1514                                         VM_FAULT_MEMORY_ERROR);
1515                         } else {
1516                                 clock_sec_t     tv_sec;
1517                                 clock_usec_t    tv_usec;
1518
1519                                 clock_get_system_microtime(&tv_sec, &tv_usec);
1520                                 current_thread()->t_page_creation_time = tv_sec;
1521                                 current_thread()->t_page_creation_count = 0;
1522                         }
1523                         if ((interruptible != THREAD_UNINT) && (current_thread()->sched_mode & TH_MODE_ABORT)) {
1524
1525                                 vm_fault_cleanup(object, first_m);
1526                                 thread_interrupt_level(interruptible_state);
1527
1528                                 return (VM_FAULT_INTERRUPTED);
1529                         }
1530                         if (m == VM_PAGE_NULL && object->phys_contiguous) {
1531                                 /*
1532                                  * No page here means that the object we
1533                                  * initially looked up was "physically
1534                                  * contiguous" (i.e. device memory).  However,
1535                                  * with Virtual VRAM, the object might not
1536                                  * be backed by that device memory anymore,
1537                                  * so we're done here only if the object is
1538                                  * still "phys_contiguous".
1539                                  * Otherwise, if the object is no longer
1540                                  * "phys_contiguous", we need to retry the
1541                                  * page fault against the object's new backing
1542                                  * store (different memory object).
1543                                  */
1544                         phys_contig_object:
1545                                 goto done;
1546                         }
1547                         /*
1548                          * potentially a pagein fault
1549                          * if we make it through the state checks
1550                          * above, then we'll count it as such
1551                          */
1552                         my_fault = DBG_PAGEIN_FAULT;
1553
1554                         /*
1555                          * Retry with same object/offset, since new data may
1556                          * be in a different page (i.e., m is meaningless at
1557                          * this point).
1558                          */
1559                         continue;
1560                 }
1561
1562                 /*
1563                  * We get here if the object has no pager, or an existence map
1564                  * exists and indicates the page isn't present on the pager
1565                  * or we're unwiring a page.  If a pager exists, but there
1566                  * is no existence map, then the m->absent case above handles
1567                  * the ZF case when the pager can't provide the page
1568                  */
1569 #if TRACEFAULTPAGE
1570                 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1571 #endif
1572                 if (object == first_object)
1573                         first_m = m;
1574                 else
1575                         assert(m == VM_PAGE_NULL);
1576
1577                 XPR(XPR_VM_FAULT,
1578                     "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1579                         object, offset, m,
1580                         object->shadow, 0);
1581
1582                 next_object = object->shadow;
1583
1584                 if (next_object == VM_OBJECT_NULL) {
1585                         /*
1586                          * we've hit the bottom of the shadow chain,
1587                          * fill the page in the top object with zeros.
1588                          */
1589                         assert(!must_be_resident);
1590
1591                         if (object != first_object) {
1592                                 vm_object_paging_end(object);
1593                                 vm_object_unlock(object);
1594
1595                                 object = first_object;
1596                                 offset = first_offset;
1597                                 vm_object_lock(object);
1598                         }
1599                         m = first_m;
1600                         assert(m->object == object);
1601                         first_m = VM_PAGE_NULL;
1602
1603                         /*
1604                          * check for any conditions that prevent
1605                          * us from creating a new zero-fill page
1606                          * vm_fault_check will do all of the
1607                          * fault cleanup in the case of an error condition
1608                          * including resetting the thread_interrupt_level
1609                          */
1610                         error = vm_fault_check(object, m, first_m, interruptible_state);
1611
1612                         if (error != VM_FAULT_SUCCESS)
1613                                 return (error);
1614
1615                         if (m == VM_PAGE_NULL) {
1616                                 m = vm_page_grab();
1617
1618                                 if (m == VM_PAGE_NULL) {
1619                                         vm_fault_cleanup(object, VM_PAGE_NULL);
1620                                         thread_interrupt_level(interruptible_state);
1621
1622                                         return (VM_FAULT_MEMORY_SHORTAGE);
1623                                 }
1624                                 vm_page_insert(m, object, offset);
1625                         }
1626                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1627
1628                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1629                                 m->absent = TRUE;
1630                         break;
1631
1632                 } else {
1633                         /*
1634                          * Move on to the next object.  Lock the next
1635                          * object before unlocking the current one.
1636                          */
1637                         if ((object != first_object) || must_be_resident)
1638                                 vm_object_paging_end(object);
1639
1640                         offset += object->shadow_offset;
1641                         fault_info->lo_offset += object->shadow_offset;
1642                         fault_info->hi_offset += object->shadow_offset;
1643                         access_required = VM_PROT_READ;
1644
1645                         vm_object_lock(next_object);
1646                         vm_object_unlock(object);
1647
1648                         object = next_object;
1649                         vm_object_paging_begin(object);
1650                 }
1651         }
1652
1653         /*
1654          *      PAGE HAS BEEN FOUND.
1655          *
1656          *      This page (m) is:
1657          *              busy, so that we can play with it;
1658          *              not absent, so that nobody else will fill it;
1659          *              possibly eligible for pageout;
1660          *
1661          *      The top-level page (first_m) is:
1662          *              VM_PAGE_NULL if the page was found in the
1663          *               top-level object;
1664          *              busy, not absent, and ineligible for pageout.
1665          *
1666          *      The current object (object) is locked.  A paging
1667          *      reference is held for the current and top-level
1668          *      objects.
1669          */
1670
1671 #if TRACEFAULTPAGE
1672         dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1673 #endif
1674 #if     EXTRA_ASSERTIONS
1675         assert(m->busy && !m->absent);
1676         assert((first_m == VM_PAGE_NULL) ||
1677                (first_m->busy && !first_m->absent &&
1678                 !first_m->active && !first_m->inactive));
1679 #endif  /* EXTRA_ASSERTIONS */
1680
1681         /*
1682          * ENCRYPTED SWAP:
1683          * If we found a page, we must have decrypted it before we
1684          * get here...
1685          */
1686         ASSERT_PAGE_DECRYPTED(m);
1687
1688         XPR(XPR_VM_FAULT,
1689             "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1690                 object, offset, m,
1691                 first_object, first_m);
1692
1693         /*
1694          * If the page is being written, but isn't
1695          * already owned by the top-level object,
1696          * we have to copy it into a new page owned
1697          * by the top-level object.
1698          */
1699         if (object != first_object) {
1700
1701 #if TRACEFAULTPAGE
1702                 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1703 #endif
1704                 if (fault_type & VM_PROT_WRITE) {
1705                         vm_page_t copy_m;
1706
1707                         /*
1708                          * We only really need to copy if we
1709                          * want to write it.
1710                          */
1711                         assert(!must_be_resident);
1712
1713                         /*
1714                          * are we protecting the system from
1715                          * backing store exhaustion.  If so
1716                          * sleep unless we are privileged.
1717                          */
1718                         if (vm_backing_store_low) {
1719                                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1720
1721                                         RELEASE_PAGE(m);
1722                                         vm_fault_cleanup(object, first_m);
1723
1724                                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1725
1726                                         thread_block(THREAD_CONTINUE_NULL);
1727                                         thread_interrupt_level(interruptible_state);
1728
1729                                         return (VM_FAULT_RETRY);
1730                                 }
1731                         }
1732                         /*
1733                          * If we try to collapse first_object at this
1734                          * point, we may deadlock when we try to get
1735                          * the lock on an intermediate object (since we
1736                          * have the bottom object locked).  We can't
1737                          * unlock the bottom object, because the page
1738                          * we found may move (by collapse) if we do.
1739                          *
1740                          * Instead, we first copy the page.  Then, when
1741                          * we have no more use for the bottom object,
1742                          * we unlock it and try to collapse.
1743                          *
1744                          * Note that we copy the page even if we didn't
1745                          * need to... that's the breaks.
1746                          */
1747
1748                         /*
1749                          * Allocate a page for the copy
1750                          */
1751                         copy_m = vm_page_grab();
1752
1753                         if (copy_m == VM_PAGE_NULL) {
1754                                 RELEASE_PAGE(m);
1755
1756                                 vm_fault_cleanup(object, first_m);
1757                                 thread_interrupt_level(interruptible_state);
1758
1759                                 return (VM_FAULT_MEMORY_SHORTAGE);
1760                         }
1761                         XPR(XPR_VM_FAULT,
1762                             "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1763                                 object, offset,
1764                                 m, copy_m, 0);
1765
1766                         vm_page_copy(m, copy_m);
1767
1768                         /*
1769                          * If another map is truly sharing this
1770                          * page with us, we have to flush all
1771                          * uses of the original page, since we
1772                          * can't distinguish those which want the
1773                          * original from those which need the
1774                          * new copy.
1775                          *
1776                          * XXXO If we know that only one map has
1777                          * access to this page, then we could
1778                          * avoid the pmap_disconnect() call.
1779                          */
1780                         if (m->pmapped)
1781                                 pmap_disconnect(m->phys_page);
1782
1783                         assert(!m->cleaning);
1784
1785                         /*
1786                          * We no longer need the old page or object.
1787                          */
1788                         PAGE_WAKEUP_DONE(m);
1789                         vm_object_paging_end(object);
1790                         vm_object_unlock(object);
1791
1792                         my_fault = DBG_COW_FAULT;
1793                         VM_STAT_INCR(cow_faults);
1794                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1795                         current_task()->cow_faults++;
1796
1797                         object = first_object;
1798                         offset = first_offset;
1799
1800                         vm_object_lock(object);
1801                         /*
1802                          * get rid of the place holder
1803                          * page that we soldered in earlier
1804                          */
1805                         VM_PAGE_FREE(first_m);
1806                         first_m = VM_PAGE_NULL;
1807
1808                         /*
1809                          * and replace it with the
1810                          * page we just copied into
1811                          */
1812                         assert(copy_m->busy);
1813                         vm_page_insert(copy_m, object, offset);
1814                         copy_m->dirty = TRUE;
1815
1816                         m = copy_m;
1817                         /*
1818                          * Now that we've gotten the copy out of the
1819                          * way, let's try to collapse the top object.
1820                          * But we have to play ugly games with
1821                          * paging_in_progress to do that...
1822                          */
1823                         vm_object_paging_end(object);
1824                         vm_object_collapse(object, offset, TRUE);
1825                         vm_object_paging_begin(object);
1826
1827                 } else
1828                         *protection &= (~VM_PROT_WRITE);
1829         }
1830         /*
1831          * Now check whether the page needs to be pushed into the
1832          * copy object.  The use of asymmetric copy on write for
1833          * shared temporary objects means that we may do two copies to
1834          * satisfy the fault; one above to get the page from a
1835          * shadowed object, and one here to push it into the copy.
1836          */
1837         try_failed_count = 0;
1838
1839         while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
1840                 vm_object_offset_t      copy_offset;
1841                 vm_page_t               copy_m;
1842
1843 #if TRACEFAULTPAGE
1844                 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);    /* (TEST/DEBUG) */
1845 #endif
1846                 /*
1847                  * If the page is being written, but hasn't been
1848                  * copied to the copy-object, we have to copy it there.
1849                  */
1850                 if ((fault_type & VM_PROT_WRITE) == 0) {
1851                         *protection &= ~VM_PROT_WRITE;
1852                         break;
1853                 }
1854
1855                 /*
1856                  * If the page was guaranteed to be resident,
1857                  * we must have already performed the copy.
1858                  */
1859                 if (must_be_resident)
1860                         break;
1861
1862                 /*
1863                  * Try to get the lock on the copy_object.
1864                  */
1865                 if (!vm_object_lock_try(copy_object)) {
1866
1867                         vm_object_unlock(object);
1868                         try_failed_count++;
1869
1870                         mutex_pause(try_failed_count);  /* wait a bit */
1871                         vm_object_lock(object);
1872
1873                         continue;
1874                 }
1875                 try_failed_count = 0;
1876
1877                 /*
1878                  * Make another reference to the copy-object,
1879                  * to keep it from disappearing during the
1880                  * copy.
1881                  */
1882                 vm_object_reference_locked(copy_object);
1883
1884                 /*
1885                  * Does the page exist in the copy?
1886                  */
1887                 copy_offset = first_offset - copy_object->shadow_offset;
1888
1889                 if (copy_object->size <= copy_offset)
1890                         /*
1891                          * Copy object doesn't cover this page -- do nothing.
1892                          */
1893                         ;
1894                 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1895                         /*
1896                          * Page currently exists in the copy object
1897                          */
1898                         if (copy_m->busy) {
1899                                 /*
1900                                  * If the page is being brought
1901                                  * in, wait for it and then retry.
1902                                  */
1903                                 RELEASE_PAGE(m);
1904
1905                                 /*
1906                                  * take an extra ref so object won't die
1907                                  */
1908                                 vm_object_reference_locked(copy_object);
1909                                 vm_object_unlock(copy_object);
1910                                 vm_fault_cleanup(object, first_m);
1911                                 counter(c_vm_fault_page_block_backoff_kernel++);
1912
1913                                 vm_object_lock(copy_object);
1914                                 assert(copy_object->ref_count > 0);
1915                                 VM_OBJ_RES_DECR(copy_object);
1916                                 vm_object_lock_assert_exclusive(copy_object);
1917                                 copy_object->ref_count--;
1918                                 assert(copy_object->ref_count > 0);
1919                                 copy_m = vm_page_lookup(copy_object, copy_offset);
1920                                 /*
1921                                  * ENCRYPTED SWAP:
1922                                  * it's OK if the "copy_m" page is encrypted,
1923                                  * because we're not moving it nor handling its
1924                                  * contents.
1925                                  */
1926                                 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1927                                         PAGE_ASSERT_WAIT(copy_m, interruptible);
1928
1929                                         vm_object_unlock(copy_object);
1930                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1931                                         vm_object_deallocate(copy_object);
1932
1933                                         goto backoff;
1934                                 } else {
1935                                         vm_object_unlock(copy_object);
1936                                         vm_object_deallocate(copy_object);
1937                                         thread_interrupt_level(interruptible_state);
1938
1939                                         return (VM_FAULT_RETRY);
1940                                 }
1941                         }
1942                 }
1943                 else if (!PAGED_OUT(copy_object, copy_offset)) {
1944                         /*
1945                          * If PAGED_OUT is TRUE, then the page used to exist
1946                          * in the copy-object, and has already been paged out.
1947                          * We don't need to repeat this. If PAGED_OUT is
1948                          * FALSE, then either we don't know (!pager_created,
1949                          * for example) or it hasn't been paged out.
1950                          * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1951                          * We must copy the page to the copy object.
1952                          */
1953
1954                         if (vm_backing_store_low) {
1955                                 /*
1956                                  * we are protecting the system from
1957                                  * backing store exhaustion.  If so
1958                                  * sleep unless we are privileged.
1959                                  */
1960                                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1961                                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1962
1963                                         RELEASE_PAGE(m);
1964                                         VM_OBJ_RES_DECR(copy_object);
1965                                         vm_object_lock_assert_exclusive(copy_object);
1966                                         copy_object->ref_count--;
1967                                         assert(copy_object->ref_count > 0);
1968
1969                                         vm_object_unlock(copy_object);
1970                                         vm_fault_cleanup(object, first_m);
1971                                         thread_block(THREAD_CONTINUE_NULL);
1972                                         thread_interrupt_level(interruptible_state);
1973
1974                                         return (VM_FAULT_RETRY);
1975                                 }
1976                         }
1977                         /*
1978                          * Allocate a page for the copy
1979                          */
1980                         copy_m = vm_page_alloc(copy_object, copy_offset);
1981
1982                         if (copy_m == VM_PAGE_NULL) {
1983                                 RELEASE_PAGE(m);
1984
1985                                 VM_OBJ_RES_DECR(copy_object);
1986                                 vm_object_lock_assert_exclusive(copy_object);
1987                                 copy_object->ref_count--;
1988                                 assert(copy_object->ref_count > 0);
1989
1990                                 vm_object_unlock(copy_object);
1991                                 vm_fault_cleanup(object, first_m);
1992                                 thread_interrupt_level(interruptible_state);
1993
1994                                 return (VM_FAULT_MEMORY_SHORTAGE);
1995                         }
1996                         /*
1997                          * Must copy page into copy-object.
1998                          */
1999                         vm_page_copy(m, copy_m);
2000
2001                         /*
2002                          * If the old page was in use by any users
2003                          * of the copy-object, it must be removed
2004                          * from all pmaps.  (We can't know which
2005                          * pmaps use it.)
2006                          */
2007                         if (m->pmapped)
2008                                 pmap_disconnect(m->phys_page);
2009
2010                         /*
2011                          * If there's a pager, then immediately
2012                          * page out this page, using the "initialize"
2013                          * option.  Else, we use the copy.
2014                          */
2015                         if ((!copy_object->pager_created)
2016 #if MACH_PAGEMAP
2017                             || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2018 #endif
2019                             ) {
2020
2021                                 vm_page_lockspin_queues();
2022                                 assert(!m->cleaning);
2023                                 vm_page_activate(copy_m);
2024                                 vm_page_unlock_queues();
2025
2026                                 copy_m->dirty = TRUE;
2027                                 PAGE_WAKEUP_DONE(copy_m);
2028                         }
2029                         else {
2030                                 assert(copy_m->busy == TRUE);
2031                                 assert(!m->cleaning);
2032
2033                                 /*
2034                                  * dirty is protected by the object lock
2035                                  */
2036                                 copy_m->dirty = TRUE;
2037
2038                                 /*
2039                                  * The page is already ready for pageout:
2040                                  * not on pageout queues and busy.
2041                                  * Unlock everything except the
2042                                  * copy_object itself.
2043                                  */
2044                                 vm_object_unlock(object);
2045
2046                                 /*
2047                                  * Write the page to the copy-object,
2048                                  * flushing it from the kernel.
2049                                  */
2050                                 vm_pageout_initialize_page(copy_m);
2051
2052                                 /*
2053                                  * Since the pageout may have
2054                                  * temporarily dropped the
2055                                  * copy_object's lock, we
2056                                  * check whether we'll have
2057                                  * to deallocate the hard way.
2058                                  */
2059                                 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2060                                         vm_object_unlock(copy_object);
2061                                         vm_object_deallocate(copy_object);
2062                                         vm_object_lock(object);
2063
2064                                         continue;
2065                                 }
2066                                 /*
2067                                  * Pick back up the old object's
2068                                  * lock.  [It is safe to do so,
2069                                  * since it must be deeper in the
2070                                  * object tree.]
2071                                  */
2072                                 vm_object_lock(object);
2073                         }
2074                         /*
2075                          * Because we're pushing a page upward
2076                          * in the object tree, we must restart
2077                          * any faults that are waiting here.
2078                          * [Note that this is an expansion of
2079                          * PAGE_WAKEUP that uses the THREAD_RESTART
2080                          * wait result].  Can't turn off the page's
2081                          * busy bit because we're not done with it.
2082                          */
2083                         if (m->wanted) {
2084                                 m->wanted = FALSE;
2085                                 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2086                         }
2087                 }
2088                 /*
2089                  * The reference count on copy_object must be
2090                  * at least 2: one for our extra reference,
2091                  * and at least one from the outside world
2092                  * (we checked that when we last locked
2093                  * copy_object).
2094                  */
2095                 vm_object_lock_assert_exclusive(copy_object);
2096                 copy_object->ref_count--;
2097                 assert(copy_object->ref_count > 0);
2098
2099                 VM_OBJ_RES_DECR(copy_object);
2100                 vm_object_unlock(copy_object);
2101
2102                 break;
2103         }
2104
2105 done:
2106         *result_page = m;
2107         *top_page = first_m;
2108
2109         XPR(XPR_VM_FAULT,
2110                 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
2111                 object, offset, m, first_m, 0);
2112
2113         if (m != VM_PAGE_NULL) {
2114                 retval = VM_FAULT_SUCCESS;
2115                 if (my_fault == DBG_PAGEIN_FAULT) {
2116
2117                         VM_STAT_INCR(pageins);
2118                         DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2119                         DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
2120                         current_task()->pageins++;
2121
2122                         if (m->object->internal) {
2123                                 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2124                                 my_fault = DBG_PAGEIND_FAULT;
2125                         } else {
2126                                 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2127                                 my_fault = DBG_PAGEINV_FAULT;
2128                         }
2129
2130                         /*
2131                          * evaluate access pattern and update state
2132                          * vm_fault_deactivate_behind depends on the
2133                          * state being up to date
2134                          */
2135                         vm_fault_is_sequential(object, offset, fault_info->behavior);
2136
2137                         vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2138                 }
2139                 if (type_of_fault)
2140                         *type_of_fault = my_fault;
2141         } else {
2142                 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2143                 assert(first_m == VM_PAGE_NULL);
2144                 assert(object == first_object);
2145         }
2146
2147         thread_interrupt_level(interruptible_state);
2148
2149 #if TRACEFAULTPAGE
2150         dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);       /* (TEST/DEBUG) */
2151 #endif
2152         return retval;
2153
2154 backoff:
2155         thread_interrupt_level(interruptible_state);
2156
2157         if (wait_result == THREAD_INTERRUPTED)
2158                 return (VM_FAULT_INTERRUPTED);
2159         return (VM_FAULT_RETRY);
2160
2161 #undef  RELEASE_PAGE
2162 }
2163
2164
2165
2166 /*
2167  * CODE SIGNING:
2168  * When soft faulting a page, we have to validate the page if:
2169  * 1. the page is being mapped in user space
2170  * 2. the page hasn't already been found to be "tainted"
2171  * 3. the page belongs to a code-signed object
2172  * 4. the page has not been validated yet or has been mapped for write.
      *
      * VM_FAULT_NEED_CS_VALIDATION(pmap, page) is non-zero only when all four
      * conditions hold; the numbered markers in the expansion match the list
      * above.  Condition 1 is implemented as "pmap is not kernel_pmap".
      *
      * "page" is expanded four times and "pmap" once, so callers should pass
      * expressions without side effects.
2173  */
2174 #define VM_FAULT_NEED_CS_VALIDATION(pmap, page)                         \
2175         ((pmap) != kernel_pmap /*1*/ &&                                 \
2176          !(page)->cs_tainted /*2*/ &&                                   \
2177          (page)->object->code_signed /*3*/ &&                           \
2178          (!(page)->cs_validated || (page)->wpmapped /*4*/))
2179
2180
2181 /*
      * vm_fault_enter:
      *
      * Enter page "m" into "pmap" at "vaddr" with protection "prot", after
      * applying the code-signing checks, and then update the page's wiring
      * or page-queue state.
      *
      * Parameters:
      *   m              page to map; m->object must be locked (see below)
      *   pmap, vaddr    physical map and virtual address to enter the page at
      *   prot           requested protection; VM_PROT_EXECUTE is cleared from
      *                  the local copy when a write fault taints the page for
      *                  the first time
      *   wired          passed through to PMAP_ENTER_OPTIONS() and
      *                  PMAP_ENTER(); with change_wiring it selects
      *                  vm_page_wire() versus vm_page_unwire()
      *   change_wiring  TRUE if this call changes the wiring of the page;
      *                  the page-queue handling is skipped in that case
      *   no_cache       put the page on the speculative queue instead of
      *                  activating it, when it was not pmapped on entry or is
      *                  already marked no_cache
      *   type_of_fault  in/out; dereferenced unconditionally, so it must not
      *                  be NULL.  Rewritten to DBG_PAGEIND_FAULT or
      *                  DBG_PAGEINV_FAULT on the first mapping of a clustered
      *                  page that came in as DBG_CACHE_HIT_FAULT.
      *
      * Returns:
      *   KERN_SUCCESS         the page was entered, or m->phys_page is
      *                        vm_page_guard_addr and nothing was done
      *   KERN_CODESIGN_ERROR  the page was rejected by the code-signing
      *                        checks and was not entered in the pmap
      *
      * Side effects on "m" when the page is entered: pmapped is set, wpmapped
      * is set for a write mapping, and cs_tainted is set if the page is
      * accepted despite being (or becoming) tainted.
      *
      * Locking:
2182  * page queue lock must NOT be held
2183  * m->object must be locked
2184  *
2185  * NOTE: m->object could be locked "shared" only if we are called
2186  * from vm_fault() as part of a soft fault.  If so, we must be
2187  * careful not to modify the VM object in any way that is not
2188  * legal under a shared lock...
2189  */
     /* Counts of tainted pages that vm_fault_enter() rejected / accepted. */
2190 unsigned long cs_enter_tainted_rejected = 0;
2191 unsigned long cs_enter_tainted_accepted = 0;
2192 kern_return_t
2193 vm_fault_enter(vm_page_t m,
2194                pmap_t pmap,
2195                vm_map_offset_t vaddr,
2196                vm_prot_t prot,
2197                boolean_t wired,
2198                boolean_t change_wiring,
2199                boolean_t no_cache,
2200                int *type_of_fault)
2201 {
2202         unsigned int    cache_attr;
2203         kern_return_t   kr, pe_result;
             /* sampled on entry: m->pmapped is forced to TRUE further down */
2204         boolean_t       previously_pmapped = m->pmapped;
2205         boolean_t       must_disconnect = 0;
2206         boolean_t       map_is_switched, map_is_switch_protected;
2207
2208         vm_object_lock_assert_held(m->object);
2209 #if DEBUG
2210         lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2211 #endif /* DEBUG */
2212
             /*
              * A page whose phys_page is vm_page_guard_addr is expected to be
              * fictitious; nothing is entered in the pmap for it.
              */
2213         if (m->phys_page == vm_page_guard_addr) {
2214                 assert(m->fictitious);
2215                 return KERN_SUCCESS;
2216         }
2217
             /* cache attributes come from the owning object's WIMG bits */
2218         cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
2219
2220         if (m->pmapped == FALSE) {
2221                 /*
2222                  * This is the first time this page is being
2223                  * mapped in an address space (pmapped == FALSE).
2224                  *
2225                  * Part of that page may still be in the data cache
2226                  * and not flushed to memory.  In case we end up
2227                  * accessing that page via the instruction cache,
2228                  * we need to ensure that the 2 caches are in sync.
2229                  */
2230                 pmap_sync_page_data_phys(m->phys_page);
2231
2232                 if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2233                         /*
2234                          * found it in the cache, but this
2235                          * is the first fault-in of the page (m->pmapped == FALSE)
2236                          * so it must have come in as part of
2237                          * a cluster... account 1 pagein against it
2238                          */
2239                         VM_STAT_INCR(pageins);
2240                         DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2241
2242                         if (m->object->internal) {
2243                                 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2244                                 *type_of_fault = DBG_PAGEIND_FAULT;
2245                         } else {
2246                                 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2247                                 *type_of_fault = DBG_PAGEINV_FAULT;
2248                         }
2249
2250                         current_task()->pageins++;
2251                 }
2252                 VM_PAGE_CONSUME_CLUSTERED(m);
2253
2254         } else if (cache_attr != VM_WIMG_DEFAULT)
                     /* already pmapped, and the object uses non-default cache attributes */
2255                 pmap_sync_page_attributes_phys(m->phys_page);
2256
             /*
              * Every fault except a copy-on-write one fires the as_fault probe,
              * plus kernel_asflt when the target is the kernel pmap.
              */
2257         if (*type_of_fault != DBG_COW_FAULT) {
2258                 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2259
2260                 if (pmap == kernel_pmap) {
2261                         DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2262                 }
2263         }
2264
2265         /* Validate code signature if necessary. */
2266         if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
2267                 vm_object_lock_assert_exclusive(m->object);
2268
2269                 if (m->cs_validated) {
2270                         vm_cs_revalidates++;
2271                 }
2272
2273                 /* VM map is locked, so 1 ref will remain on VM object -
2274                  * so no harm if vm_page_validate_cs drops the object lock */
2275                 vm_page_validate_cs(m);
2276         }
2277
             /*
              * A page counts as "immutable" once it has been validated.  The
              * "prot" argument is currently unused because the VM_PROT_EXECUTE
              * test is commented out.  The macro is defined inside the function
              * and is not #undef'd afterwards.
              */
2278 #define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)
2279
             /*
              * "switched" here means: pmap is the pmap of the current thread's
              * map but not the pmap of the current task's map.
              */
2280         map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2281                            (pmap == vm_map_pmap(current_thread()->map)));
2282         map_is_switch_protected = current_thread()->map->switch_protect;
2283
2284         /* If the map is switched, and is switch-protected, we must protect
2285          * some pages from being write-faulted: immutable pages because by
2286          * definition they may not be written, and executable pages because that
2287          * would provide a way to inject unsigned code.
2288          * If the page is immutable, we can simply return. However, we can't
2289          * immediately determine whether a page is executable anywhere. But,
2290          * we can disconnect it everywhere and remove the executable protection
2291          * from the current map. We do that below right before we do the
2292          * PMAP_ENTER.
2293          */
2294         if(!cs_enforcement_disable && map_is_switched &&
2295            map_is_switch_protected && page_immutable(m, prot) &&
2296            (prot & VM_PROT_WRITE))
2297         {
2298                 return KERN_CODESIGN_ERROR;
2299         }
2300
2301         /* A page could be tainted, or pose a risk of being tainted later.
2302          * Check whether the receiving process wants it, and make it feel
2303          * the consequences (that happens in cs_invalid_page()).
2304          * For CS Enforcement, two other conditions will
2305          * cause that page to be tainted as well:
2306          * - pmapping an unsigned page executable - this means unsigned code;
2307          * - writeable mapping of a validated page - the content of that page
2308          *   can be changed without the kernel noticing, therefore unsigned
2309          *   code can be created
2310          */
2311         if (m->cs_tainted ||
2312             ( !cs_enforcement_disable &&
2313              (/* The page is unsigned and wants to be executable */
2314               (!m->cs_validated && (prot & VM_PROT_EXECUTE))  ||
2315               /* The page should be immutable, but is in danger of being modified
2316                 * This is the case where we want policy from the code directory -
2317                 * is the page immutable or not? For now we have to assume that
2318                 * code pages will be immutable, data pages not.
2319                 * We'll assume a page is a code page if it has a code directory
2320                 * and we fault for execution.
2321                 * That is good enough since if we faulted the code page for
2322                 * writing in another map before, it is wpmapped; if we fault
2323                 * it for writing in this map later it will also be faulted for executing
2324                 * at the same time; and if we fault for writing in another map
2325                 * later, we will disconnect it from this pmap so we'll notice
2326                 * the change.
2327                 */
2328               (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped))
2329               ))
2330                 )
2331         {
2332                 /* We will have a tainted page. Have to handle the special case
2333                  * of a switched map now. If the map is not switched, standard
2334                  * procedure applies - call cs_invalid_page().
2335                  * If the map is switched, the real owner is invalid already.
2336                  * There is no point in invalidating the switching process since
2337                  * it will not be executing from the map. So we don't call
2338                  * cs_invalid_page() in that case. */
2339                 boolean_t reject_page;
2340                 if(map_is_switched) {
2341                         assert(pmap==vm_map_pmap(current_thread()->map));
2342                         assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2343                         reject_page = FALSE;
2344                 } else {
2345                         reject_page = cs_invalid_page((addr64_t) vaddr);
2346                 }
2347
2348                 if (reject_page) {
2349                         /* reject the tainted page: abort the page fault */
2350                         kr = KERN_CODESIGN_ERROR;
2351                         cs_enter_tainted_rejected++;
2352                 } else {
2353                         /* proceed with the tainted page */
2354                         kr = KERN_SUCCESS;
2355                         /* Page might have been tainted before or not; now it
2356                          * definitively is. If the page wasn't tainted, we must
2357                          * disconnect it from all pmaps later. */
                             /* NOTE(review): cs_tainted is written here with no
                              * exclusive-lock assertion in this branch -- confirm
                              * against the shared-lock rule in the header. */
2358                         must_disconnect = !m->cs_tainted;
2359                         m->cs_tainted = TRUE;
2360                         cs_enter_tainted_accepted++;
2361                 }
                     /* log when cs_debug is set, and always when the page was rejected */
2362                 if (cs_debug || kr != KERN_SUCCESS) {
2363                         printf("CODESIGNING: vm_fault_enter(0x%llx): "
2364                                "page %p obj %p off 0x%llx *** INVALID PAGE ***\n",
2365                                (long long)vaddr, m, m->object, m->offset);
2366                 }
2367
2368         } else {
2369                 /* proceed with the valid page */
2370                 kr = KERN_SUCCESS;
2371         }
2372
2373         /* If we have a KERN_SUCCESS from the previous checks, we either have
2374          * a good page, or a tainted page that has been accepted by the process.
2375          * In both cases the page will be entered into the pmap.
2376          * If the page is writeable, we need to disconnect it from other pmaps
2377          * now so those processes can take note.
2378          */
2379         if (kr == KERN_SUCCESS) {
2380                 /*
2381                  * NOTE: we may only hold the vm_object lock SHARED
2382                  * at this point, but the update of pmapped is ok
2383                  * since this is the ONLY bit updated behind the SHARED
2384                  * lock... however, we need to figure out how to do an atomic
2385                  * update on a bit field to make this less fragile... right
2386                  * now I don't know how to coerce 'C' to give me the offset info
2387                  * that's needed for an AtomicCompareAndSwap
2388                  */
2389                 m->pmapped = TRUE;
2390                 if (prot & VM_PROT_WRITE) {
2391                         vm_object_lock_assert_exclusive(m->object);
2392                         m->wpmapped = TRUE;
2393                         if(must_disconnect) {
2394                                 /* We can only get here
2395                                  * because of the CSE logic */
2396                                 assert(cs_enforcement_disable == FALSE);
2397                                 pmap_disconnect(m->phys_page);
2398                                 /* If we are faulting for a write, we can clear
2399                                  * the execute bit - that will ensure the page is
2400                                  * checked again before being executable, which
2401                                  * protects against a map switch.
2402                                  * This only happens the first time the page
2403                                  * gets tainted, so we won't get stuck here
2404                                  * to make an already writeable page executable. */
2405                                 prot &= ~VM_PROT_EXECUTE;
2406                         }
2407                 }
2408
2409                 /* Prevent a deadlock by not
2410                  * holding the object lock if we need to wait for a page in
2411                  * pmap_enter() - <rdar://problem/7138958> */
2412                 PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, cache_attr,
2413                                   wired, PMAP_OPTIONS_NOWAIT, pe_result);
2414
                     /* only a resource shortage is handled; pe_result is not
                      * examined for any other value */
2415                 if(pe_result == KERN_RESOURCE_SHORTAGE) {
2416                         /* The nonblocking version of pmap_enter did not succeed.
2417                          * Use the blocking version instead. Requires marking
2418                          * the page busy and unlocking the object */
                             /* NOTE(review): m->busy is written below with no
                              * exclusive-lock assertion, although the comment on
                              * m->pmapped above calls pmapped the only bit updated
                              * under a shared lock -- confirm this path is never
                              * reached with the object lock held shared. */
2419                         boolean_t was_busy = m->busy;
2420                         m->busy = TRUE;
2421                         vm_object_unlock(m->object);
2422
2423                         PMAP_ENTER(pmap, vaddr, m, prot, cache_attr, wired);
2424
2425                         /* Take the object lock again. */
2426                         vm_object_lock(m->object);
2427
2428                         /* If the page was busy, someone else will wake it up.
2429                          * Otherwise, we have to do it now. */
2430                         assert(m->busy);
2431                         if(!was_busy) {
2432                                 PAGE_WAKEUP_DONE(m);
2433                         }
2434                         vm_pmap_enter_blocked++;
2435                 }
2436         }
2437
2438         /*
2439          * Hold queues lock to manipulate
2440          * the page queues.  Change wiring
2441          * case is obvious.
2442          */
2443         if (change_wiring) {
2444                 vm_page_lockspin_queues();
2445
                     /* a rejected page (kr != KERN_SUCCESS) is never wired,
                      * but an unwire request is honored regardless of kr */
2446                 if (wired) {
2447                         if (kr == KERN_SUCCESS) {
2448                                 vm_page_wire(m);
2449                         }
2450                 } else {
2451                         vm_page_unwire(m, TRUE);
2452                 }
2453                 vm_page_unlock_queues();
2454
2455         } else {
                     /* not changing wiring: a rejected page is deactivated;
                      * an entered page may need to be put on a page queue */
2456                 if (kr != KERN_SUCCESS) {
2457                         vm_page_lockspin_queues();
2458                         vm_page_deactivate(m);
2459                         vm_page_unlock_queues();
2460                 } else {
                             /* first test is made without the page queue lock;
                              * it is repeated below (minus the throttled check)
                              * once that lock is held */
2461                         if (((!m->active && !m->inactive) || no_cache) && !VM_PAGE_WIRED(m) && !m->throttled) {
2462
2463                                 if ( vm_page_local_q && !no_cache && (*type_of_fault == DBG_COW_FAULT || *type_of_fault == DBG_ZERO_FILL_FAULT) ) {
2464                                         struct vpl      *lq;
2465                                         uint32_t        lid;
2466
2467                                         /*
2468                                          * we got a local queue to stuff this new page on...
2469                                          * it's safe to manipulate local and local_id at this point
2470                                          * since we're behind an exclusive object lock and the
2471                                          * page is not on any global queue.
2472                                          *
2473                                          * we'll use the current cpu number to select the queue
2474                                          * note that we don't need to disable preemption... we're
2475                                          * going to be behind the local queue's lock to do the real
2476                                          * work
2477                                          */
2478                                         lid = cpu_number();
2479
2480                                         lq = &vm_page_local_q[lid].vpl_un.vpl;
2481
2482                                         VPL_LOCK(&lq->vpl_lock);
2483
2484                                         queue_enter(&lq->vpl_queue, m, vm_page_t, pageq);
2485                                         m->local = TRUE;
2486                                         m->local_id = lid;
2487                                         lq->vpl_count++;
2488
2489                                         VPL_UNLOCK(&lq->vpl_lock);
2490
                                             /* vpl_count is read here after VPL_UNLOCK;
                                              * NOTE(review): presumably an intentionally
                                              * approximate soft-limit check -- confirm */
2491                                         if (lq->vpl_count > vm_page_local_q_soft_limit) {
2492                                                 /*
2493                                                  * we're beyond the soft limit for the local queue
2494                                                  * vm_page_reactivate_local will 'try' to take
2495                                                  * the global page queue lock... if it can't that's
2496                                                  * ok... we'll let the queue continue to grow up
2497                                                  * to the hard limit... at that point we'll wait
2498                                                  * for the lock... once we've got the lock, we'll
2499                                                  * transfer all of the pages from the local queue
2500                                                  * to the global active queue
2501                                                  */
2502                                                 vm_page_reactivate_local(lid, FALSE, FALSE);
2503                                         }
                                             /* page is on a local queue: skip the
                                              * global page queue handling below */
2504                                         return kr;
2505                                 }
2506
2507                                 vm_page_lockspin_queues();
2508                                 /*
2509                                  * test again now that we hold the page queue lock
2510                                  */
2511                                 if (((!m->active && !m->inactive) || no_cache) && !VM_PAGE_WIRED(m)) {
2512
2513                                         /*
2514                                          * If this is a no_cache mapping and the page has never been
2515                                          * mapped before or was previously a no_cache page, then we
2516                                          * want to leave pages in the speculative state so that they
2517                                          * can be readily recycled if free memory runs low.  Otherwise
2518                                          * the page is activated as normal.
2519                                          */
2520
2521                                         if (no_cache && (!previously_pmapped || m->no_cache)) {
2522                                                 m->no_cache = TRUE;
2523
2524                                                 if (m->active || m->inactive)
2525                                                         VM_PAGE_QUEUES_REMOVE(m);
2526
2527                                                 if (!m->speculative)
2528                                                         vm_page_speculate(m, TRUE);
2529
2530                                         } else if (!m->active && !m->inactive)
2531                                                 vm_page_activate(m);
2532
2533                                 }
2534
2535                                 vm_page_unlock_queues();
2536                         }
2537                 }
2538         }
2539         return kr;
2540 }
2541
2542
2543 /*
2544  *      Routine:        vm_fault
2545  *      Purpose:
2546  *              Handle page faults, including pseudo-faults
2547  *              used to change the wiring status of pages.
2548  *      Returns:
2549  *              Explicit continuations have been removed.
2550  *      Implementation:
2551  *              vm_fault and vm_fault_page save mucho state
2552  *              in the moral equivalent of a closure.  The state
2553  *              structure is allocated when first entering vm_fault
2554  *              and deallocated when leaving vm_fault.
2555  */
2556
2557 extern int _map_enter_debug;      /* NOTE(review): defined elsewhere; no use is visible in the reviewed section -- confirm it is still needed */
2558
     /*
      * NOTE(review): presumably statistics on object-collapse attempts made on
      * behalf of vm_fault(); the code that updates them is not visible in the
      * reviewed section -- confirm before relying on their meaning.
      */
2559 unsigned long vm_fault_collapse_total = 0;
2560 unsigned long vm_fault_collapse_skipped = 0;
2561
2562 kern_return_t
2563 vm_fault(
2564         vm_map_t        map,
2565         vm_map_offset_t vaddr,
2566         vm_prot_t       fault_type,
2567         boolean_t       change_wiring,
2568         int             interruptible,
2569         pmap_t          caller_pmap,
2570         vm_map_offset_t caller_pmap_addr)
2571 {
2572         vm_map_version_t        version;        /* Map version for verificiation */
2573         boolean_t               wired;          /* Should mapping be wired down? */
2574         vm_object_t             object;         /* Top-level object */
2575         vm_object_offset_t      offset;         /* Top-level offset */
2576         vm_prot_t               prot;           /* Protection for mapping */
2577         vm_object_t             old_copy_object; /* Saved copy object */
2578         vm_page_t               result_page;    /* Result of vm_fault_page */
2579         vm_page_t               top_page;       /* Placeholder page */
2580         kern_return_t           kr;
2581
2582         vm_page_t               m;      /* Fast access to result_page */
2583         kern_return_t           error_code;
2584         vm_object_t             cur_object;
2585         vm_object_offset_t      cur_offset;
2586         vm_page_t               cur_m;
2587         vm_object_t             new_object;
2588         int                     type_of_fault;
2589         pmap_t                  pmap;
2590         boolean_t               interruptible_state;
2591         vm_map_t                real_map = map;
2592         vm_map_t                original_map = map;
2593         vm_prot_t               original_fault_type;
2594         struct vm_object_fault_info fault_info;
2595         boolean_t               need_collapse = FALSE;
2596         int                     object_lock_type = 0;
2597         int                     cur_object_lock_type;
2598         vm_object_t             top_object = VM_OBJECT_NULL;
2599
2600
2601         KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
2602                               (int)((uint64_t)vaddr >> 32),
2603                               (int)vaddr,
2604                               0,
2605                               0,
2606                               0);
2607
2608         if (get_preemption_level() != 0) {
2609                 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
2610                                       (int)((uint64_t)vaddr >> 32),
2611                                       (int)vaddr,
2612                                       KERN_FAILURE,
2613                                       0,
2614                                       0);
2615
2616                 return (KERN_FAILURE);
2617         }
2618
2619         interruptible_state = thread_interrupt_level(interruptible);
2620
2621         VM_STAT_INCR(faults);
2622         current_task()->faults++;
2623         original_fault_type = fault_type;
2624
2625         if (fault_type & VM_PROT_WRITE)
2626                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2627         else
2628                 object_lock_type = OBJECT_LOCK_SHARED;
2629
2630         cur_object_lock_type = OBJECT_LOCK_SHARED;
2631
2632 RetryFault:
2633         /*
2634          * assume we will hit a page in the cache
2635          * otherwise, explicitly override with
2636          * the real fault type once we determine it
2637          */
2638         type_of_fault = DBG_CACHE_HIT_FAULT;
2639
2640         /*
2641          *      Find the backing store object and offset into
2642          *      it to begin the search.
2643          */
2644         fault_type = original_fault_type;
2645         map = original_map;
2646         vm_map_lock_read(map);
2647
2648         kr = vm_map_lookup_locked(&map, vaddr, fault_type,
2649                                   object_lock_type, &version,
2650                                   &object, &offset, &prot, &wired,
2651                                   &fault_info,
2652                                   &real_map);
2653
2654         if (kr != KERN_SUCCESS) {
2655                 vm_map_unlock_read(map);
2656                 goto done;
2657         }
2658         pmap = real_map->pmap;
2659         fault_info.interruptible = interruptible;
2660         fault_info.stealth = FALSE;
2661         fault_info.mark_zf_absent = FALSE;
2662
2663         /*
2664          * If the page is wired, we must fault for the current protection
2665          * value, to avoid further faults.
2666          */
2667         if (wired) {
2668                 fault_type = prot | VM_PROT_WRITE;
2669                 /*
2670                  * since we're treating this fault as a 'write'
2671                  * we must hold the top object lock exclusively
2672                  */
2673                 if (object_lock_type == OBJECT_LOCK_SHARED) {
2674
2675                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2676
2677                         if (vm_object_lock_upgrade(object) == FALSE) {
2678                                 /*
2679                                  * couldn't upgrade, so explicitly
2680                                  * take the lock exclusively
2681                                  */
2682                                 vm_object_lock(object);
2683                         }
2684                 }
2685         }
2686
2687 #if     VM_FAULT_CLASSIFY
2688         /*
2689          *      Temporary data gathering code
2690          */
2691         vm_fault_classify(object, offset, fault_type);
2692 #endif
2693         /*
2694          *      Fast fault code.  The basic idea is to do as much as
2695          *      possible while holding the map lock and object locks.
2696          *      Busy pages are not used until the object lock has to
2697          *      be dropped to do something (copy, zero fill, pmap enter).
2698          *      Similarly, paging references aren't acquired until that
2699          *      point, and object references aren't used.
2700          *
2701          *      If we can figure out what to do
2702          *      (zero fill, copy on write, pmap enter) while holding
2703          *      the locks, then it gets done.  Otherwise, we give up,
2704          *      and use the original fault path (which doesn't hold
2705          *      the map lock, and relies on busy pages).
2706          *      The give up cases include:
2707          *              - Have to talk to pager.
2708          *              - Page is busy, absent or in error.
2709          *              - Pager has locked out desired access.
2710          *              - Fault needs to be restarted.
2711          *              - Have to push page into copy object.
2712          *
2713          *      The code is an infinite loop that moves one level down
2714          *      the shadow chain each time.  cur_object and cur_offset
2715          *      refer to the current object being examined. object and offset
2716          *      are the original object from the map.  The loop is at the
2717          *      top level if and only if object and cur_object are the same.
2718          *
2719          *      Invariants:  Map lock is held throughout.  Lock is held on
2720          *              original object and cur_object (if different) when
2721          *              continuing or exiting loop.
2722          *
2723          */
2724
2725
2726         /*
2727          * If this page is to be inserted in a copy delay object
2728          * for writing, and if the object has a copy, then the
2729          * copy delay strategy is implemented in the slow fault page.
2730          */
2731         if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
2732             object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
2733                 goto handle_copy_delay;
2734
2735         cur_object = object;
2736         cur_offset = offset;
2737
2738         while (TRUE) {
2739                 if (!cur_object->pager_created &&
2740                     cur_object->phys_contiguous) /* superpage */
2741                         break;
2742
2743                 if (cur_object->blocked_access) {
2744                         /*
2745                          * Access to this VM object has been blocked.
2746                          * Let the slow path handle it.
2747                          */
2748                         break;
2749                 }
2750
2751                 m = vm_page_lookup(cur_object, cur_offset);
2752
2753                 if (m != VM_PAGE_NULL) {
2754                         if (m->busy) {
2755                                 wait_result_t   result;
2756
2757                                 /*
2758                                  * in order to do the PAGE_ASSERT_WAIT, we must
2759                                  * have object that 'm' belongs to locked exclusively
2760                                  */
2761                                 if (object != cur_object) {
2762                                         vm_object_unlock(object);
2763
2764                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2765
2766                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2767
2768                                                 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2769                                                         /*
2770                                                          * couldn't upgrade so go do a full retry
2771                                                          * immediately since we've already dropped
2772                                                          * the top object lock associated with this page
2773                                                          * and the current one got dropped due to the
2774                                                          * failed upgrade... the state is no longer valid
2775                                                          */
2776                                                         vm_map_unlock_read(map);
2777                                                         if (real_map != map)
2778                                                                 vm_map_unlock(real_map);
2779
2780                                                         goto RetryFault;
2781                                                 }
2782                                         }
2783                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2784
2785                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2786
2787                                         if (vm_object_lock_upgrade(object) == FALSE) {
2788                                                 /*
2789                                                  * couldn't upgrade, so explicitly take the lock
2790                                                  * exclusively and go relookup the page since we
2791                                                  * will have dropped the object lock and
2792                                                  * a different thread could have inserted
2793                                                  * a page at this offset
2794                                                  * no need for a full retry since we're
2795                                                  * at the top level of the object chain
2796                                                  */
2797                                                 vm_object_lock(object);
2798
2799                                                 continue;
2800                                         }
2801                                 }
2802                                 vm_map_unlock_read(map);
2803                                 if (real_map != map)
2804                                         vm_map_unlock(real_map);
2805
2806                                 result = PAGE_ASSERT_WAIT(m, interruptible);
2807
2808                                 vm_object_unlock(cur_object);
2809
2810                                 if (result == THREAD_WAITING) {
2811                                         result = thread_block(THREAD_CONTINUE_NULL);
2812
2813                                         counter(c_vm_fault_page_block_busy_kernel++);
2814                                 }
2815                                 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2816                                         goto RetryFault;
2817
2818                                 kr = KERN_ABORTED;
2819                                 goto done;
2820                         }
2821                         if (m->phys_page == vm_page_guard_addr) {
2822                                 /*
2823                                  * Guard page: let the slow path deal with it
2824                                  */
2825                                 break;
2826                         }
2827                         if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
2828                                 /*
2829                                  * Unusual case... let the slow path deal with it
2830                                  */
2831                                 break;
2832                         }
2833                         if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m->object)) {
2834                                 if (object != cur_object)
2835                                         vm_object_unlock(object);
2836                                 vm_map_unlock_read(map);
2837                                 if (real_map != map)
2838                                         vm_map_unlock(real_map);
2839                                 vm_object_unlock(cur_object);
2840                                 kr = KERN_MEMORY_ERROR;
2841                                 goto done;
2842                         }
2843
2844                         if (m->encrypted) {
2845                                 /*
2846                                  * ENCRYPTED SWAP:
2847                                  * We've soft-faulted (because it's not in the page
2848                                  * table) on an encrypted page.
2849                                  * Keep the page "busy" so that no one messes with
2850                                  * it during the decryption.
2851                                  * Release the extra locks we're holding, keep only
2852                                  * the page's VM object lock.
2853                                  *
2854                                  * in order to set 'busy' on 'm', we must
2855                                  * have object that 'm' belongs to locked exclusively
2856                                  */
2857                                 if (object != cur_object) {
2858                                         vm_object_unlock(object);
2859
2860                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2861
2862                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2863
2864                                                 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2865                                                         /*
2866                                                          * couldn't upgrade so go do a full retry
2867                                                          * immediately since we've already dropped
2868                                                          * the top object lock associated with this page
2869                                                          * and the current one got dropped due to the
2870                                                          * failed upgrade... the state is no longer valid
2871                                                          */
2872                                                         vm_map_unlock_read(map);
2873                                                         if (real_map != map)
2874                                                                 vm_map_unlock(real_map);
2875
2876                                                         goto RetryFault;
2877                                                 }
2878                                         }
2879                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2880
2881                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2882
2883                                         if (vm_object_lock_upgrade(object) == FALSE) {
2884                                                 /*
2885                                                  * couldn't upgrade, so explicitly take the lock
2886                                                  * exclusively and go relookup the page since we
2887                                                  * will have dropped the object lock and
2888                                                  * a different thread could have inserted
2889                                                  * a page at this offset
2890                                                  * no need for a full retry since we're
2891                                                  * at the top level of the object chain
2892                                                  */
2893                                                 vm_object_lock(object);
2894
2895                                                 continue;
2896                                         }
2897                                 }
2898                                 m->busy = TRUE;
2899
2900                                 vm_map_unlock_read(map);
2901                                 if (real_map != map)
2902                                         vm_map_unlock(real_map);
2903
2904                                 vm_page_decrypt(m, 0);
2905
2906                                 assert(m->busy);
2907                                 PAGE_WAKEUP_DONE(m);
2908
2909                                 vm_object_unlock(cur_object);
2910                                 /*
2911                                  * Retry from the top, in case anything
2912                                  * changed while we were decrypting...
2913                                  */
2914                                 goto RetryFault;
2915                         }
2916                         ASSERT_PAGE_DECRYPTED(m);
2917
2918                         if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) {
2919                                 /*
2920                                  * We might need to validate this page
2921                                  * against its code signature, so we
2922                                  * want to hold the VM object exclusively.
2923                                  */
2924                                 if (object != cur_object) {
2925                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2926                                                 vm_object_unlock(object);
2927                                                 vm_object_unlock(cur_object);
2928
2929                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2930
2931                                                 vm_map_unlock_read(map);
2932                                                 if (real_map != map)
2933                                                         vm_map_unlock(real_map);
2934
2935                                                 goto RetryFault;
2936                                         }
2937
2938                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2939
2940                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2941
2942                                         if (vm_object_lock_upgrade(object) == FALSE) {
2943                                                 /*
2944                                                  * couldn't upgrade, so explicitly take the lock
2945                                                  * exclusively and go relookup the page since we
2946                                                  * will have dropped the object lock and
2947                                                  * a different thread could have inserted
2948                                                  * a page at this offset
2949                                                  * no need for a full retry since we're
2950                                                  * at the top level of the object chain
2951                                                  */
2952                                                 vm_object_lock(object);
2953
2954                                                 continue;
2955                                         }
2956                                 }
2957                         }
2958                         /*
2959                          *      Two cases of map in faults:
2960                          *          - At top level w/o copy object.
2961                          *          - Read fault anywhere.
2962                          *              --> must disallow write.
2963                          */
2964
2965                         if (object == cur_object && object->copy == VM_OBJECT_NULL) {
2966                                 if ((fault_type & VM_PROT_WRITE) == 0) {
2967                                         /*
2968                                          * This is not a "write" fault, so we
2969                                          * might not have taken the object lock
2970                                          * exclusively and we might not be able
2971                                          * to update the "wpmapped" bit in
2972                                          * vm_fault_enter().
2973                                          * Let's just grant read access to
2974                                          * the page for now and we'll
2975                                          * soft-fault again if we need write
2976                                          * access later...
2977                                          */
2978                                         prot &= ~VM_PROT_WRITE;
2979                                 }
2980                                 goto FastPmapEnter;
2981                         }
2982
2983                         if ((fault_type & VM_PROT_WRITE) == 0) {
2984
2985                                 prot &= ~VM_PROT_WRITE;
2986
2987                                 if (object != cur_object) {
2988                                         /*
2989                                          * We still need to hold the top object
2990                                          * lock here to prevent a race between
2991                                          * a read fault (taking only "shared"
2992                                          * locks) and a write fault (taking
2993                                          * an "exclusive" lock on the top
2994                                          * object).
2995                                          * Otherwise, as soon as we release the
2996                                          * top lock, the write fault could
2997                                          * proceed and actually complete before
2998                                          * the read fault, and the copied page's
2999                                          * translation could then be overwritten
3000                                          * by the read fault's translation for
3001                                          * the original page.
3002                                          *
3003                                          * Let's just record what the top object
3004                                          * is and we'll release it later.
3005                                          */
3006                                         top_object = object;
3007
3008                                         /*
3009                                          * switch to the object that has the new page
3010                                          */
3011                                         object = cur_object;
3012                                         object_lock_type = cur_object_lock_type;
3013                                 }
3014 FastPmapEnter:
3015                                 /*
3016                                  * prepare for the pmap_enter...
3017                                  * object and map are both locked
3018                                  * m contains valid data
3019                                  * object == m->object
3020                                  * cur_object == NULL or it's been unlocked
3021                                  * no paging references on either object or cur_object
3022                                  */
3023 #if     MACH_KDB
3024                                 if (db_watchpoint_list && (fault_type & VM_PROT_WRITE) == 0)
3025                                         prot &= ~VM_PROT_WRITE;
3026 #endif
3027                                 if (caller_pmap) {
3028                                         kr = vm_fault_enter(m,
3029                                                             caller_pmap,
3030                                                             caller_pmap_addr,
3031                                                             prot,
3032                                                             wired,
3033                                                             change_wiring,
3034                                                             fault_info.no_cache,
3035                                                             &type_of_fault);
3036                                 } else {
3037                                         kr = vm_fault_enter(m,
3038                                                             pmap,
3039                                                             vaddr,
3040                                                             prot,
3041                                                             wired,
3042                                                             change_wiring,
3043                                                             fault_info.no_cache,
3044                                                             &type_of_fault);
3045                                 }
3046
3047                                 if (top_object != VM_OBJECT_NULL) {
3048                                         /*
3049                                          * It's safe to drop the top object
3050                                          * now that we've done our
3051                                          * vm_fault_enter().  Any other fault
3052                                          * in progress for that virtual
3053                                          * address will either find our page
3054                                          * and translation or put in a new page
3055                                          * and translation.
3056                                          */
3057                                         vm_object_unlock(top_object);
3058                                         top_object = VM_OBJECT_NULL;
3059                                 }
3060
3061                                 if (need_collapse == TRUE)
3062                                         vm_object_collapse(object, offset, TRUE);
3063
3064                                 if (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT) {
3065                                         /*
3066                                          * evaluate access pattern and update state
3067                                          * vm_fault_deactivate_behind depends on the
3068                                          * state being up to date
3069                                          */
3070                                         vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
3071
3072                                         vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
3073                                 }
3074                                 /*
3075                                  * That's it, clean up and return.
3076                                  */
3077                                 if (m->busy)
3078                                         PAGE_WAKEUP_DONE(m);
3079
3080                                 vm_object_unlock(object);
3081
3082                                 vm_map_unlock_read(map);
3083                                 if (real_map != map)
3084                                         vm_map_unlock(real_map);
3085
3086                                 goto done;
3087                         }
3088                         /*
3089                          * COPY ON WRITE FAULT
3090                          */
3091                         assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
3092
3093                         if (vm_page_throttled()) {
3094                                 /*
3095                                  * drop all of our locks...
3096                                  * wait until the free queue is
3097                                  * pumped back up and then
3098                                  * redrive the fault
3099                                  */
3100                                 if (object != cur_object)
3101                                         vm_object_unlock(cur_object);
3102                                 vm_object_unlock(object);
3103                                 vm_map_unlock_read(map);
3104                                 if (real_map != map)
3105                                         vm_map_unlock(real_map);
3106
3107                                 if (NEED_TO_HARD_THROTTLE_THIS_TASK())
3108                                         delay(HARD_THROTTLE_DELAY);
3109
3110                                 if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
3111                                                  THREAD_UNINT :
3112                                                  THREAD_ABORTSAFE))
3113                                         goto RetryFault;
3114                                 kr = KERN_ABORTED;
3115                                 goto done;
3116                         }
3117                         /*
3118                          * If objects match, then
3119                          * object->copy must not be NULL (else control
3120                          * would be in previous code block), and we
3121                          * have a potential push into the copy object
3122                          * with which we can't cope here.
3123                          */
3124                         if (cur_object == object) {
3125                                 /*
3126                                  * must take the slow path to
3127                                  * deal with the copy push
3128                                  */
3129                                 break;
3130                         }
3131                         /*
3132                          * This is now a shadow based copy on write
3133                          * fault -- it requires a copy up the shadow
3134                          * chain.
3135                          *
3136                          * Allocate a page in the original top level
3137                          * object. Give up if allocate fails.  Also
3138                          * need to remember current page, as it's the
3139                          * source of the copy.
3140                          *
3141                          * at this point we hold locks on both
3142                          * object and cur_object... no need to take
3143                          * paging refs or mark pages BUSY since
3144                          * we don't drop either object lock until
3145                          * the page has been copied and inserted
3146                          */
3147                         cur_m = m;
3148                         m = vm_page_grab();
3149
3150                         if (m == VM_PAGE_NULL) {
3151                                 /*
3152                                  * no free page currently available...
3153                                  * must take the slow path
3154                                  */
3155                                 break;
3156                         }
3157                         /*
3158                          * Now do the copy.  Mark the source page busy...
3159                          *
3160                          *      NOTE: This code holds the map lock across
3161                          *      the page copy.
3162                          */
3163                         vm_page_copy(cur_m, m);
3164                         vm_page_insert(m, object, offset);
3165                         m->dirty = TRUE;
3166
3167                         /*
3168                          * Now cope with the source page and object
3169                          */
3170                         if (object->ref_count > 1 && cur_m->pmapped)
3171                                 pmap_disconnect(cur_m->phys_page);
3172
3173                         need_collapse = TRUE;
3174
3175                         if (!cur_object->internal &&
3176                             cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
3177                                 /*
3178                                  * The object from which we've just
3179                                  * copied a page is most probably backed
3180                                  * by a vnode.  We don't want to waste too
3181                                  * much time trying to collapse the VM objects
3182                                  * and create a bottleneck when several tasks
3183                                  * map the same file.
3184                                  */
3185                                 if (cur_object->copy == object) {
3186                                         /*
3187                                          * Shared mapping or no COW yet.
3188                                          * We can never collapse a copy
3189                                          * object into its backing object.
3190                                          */
3191                                         need_collapse = FALSE;
3192                                 } else if (cur_object->copy == object->shadow &&
3193                                            object->shadow->resident_page_count == 0) {
3194                                         /*
3195                                          * Shared mapping after a COW occurred.
3196                                          */
3197                                         need_collapse = FALSE;
3198                                 }
3199                         }
3200                         vm_object_unlock(cur_object);
3201
3202                         if (need_collapse == FALSE)
3203                                 vm_fault_collapse_skipped++;
3204                         vm_fault_collapse_total++;
3205
3206                         type_of_fault = DBG_COW_FAULT;
3207                         VM_STAT_INCR(cow_faults);
3208                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
3209                         current_task()->cow_faults++;
3210
3211                         goto FastPmapEnter;
3212
3213                 } else {
3214                         /*
3215                          * No page at cur_object, cur_offset... m == NULL
3216                          */
3217                         if (cur_object->pager_created) {
3218                                 if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) {
3219                                         /*
3220                                          * May have to talk to a pager...
3221                                          * take the slow path.
3222                                          */
3223                                         break;
3224                                 }
3225                                 /*
3226                                  * existence map present and indicates
3227                                  * that the pager doesn't have this page
3228                                  */
3229                         }
3230                         if (cur_object->shadow == VM_OBJECT_NULL) {
3231                                 /*
3232                                  * Zero fill fault.  Page gets
3233                                  * inserted into the original object.
3234                                  */
3235                                 if (cur_object->shadow_severed ||
3236                                     VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object))
3237                                 {
3238                                         if (object != cur_object)
3239                                                 vm_object_unlock(cur_object);
3240                                         vm_object_unlock(object);
3241
3242                                         vm_map_unlock_read(map);
3243                                         if (real_map != map)
3244                                                 vm_map_unlock(real_map);
3245
3246                                         kr = KERN_MEMORY_ERROR;
3247                                         goto done;
3248                                 }
3249                                 if (vm_page_throttled()) {
3250                                         /*
3251                                          * drop all of our locks...
3252                                          * wait until the free queue is
3253                                          * pumped back up and then
3254                                          * redrive the fault
3255                                          */
3256                                         if (object != cur_object)
3257                                                 vm_object_unlock(cur_object);
3258                                         vm_object_unlock(object);
3259                                         vm_map_unlock_read(map);
3260                                         if (real_map != map)
3261                                                 vm_map_unlock(real_map);
3262
3263                                         if (NEED_TO_HARD_THROTTLE_THIS_TASK())
3264                                                 delay(HARD_THROTTLE_DELAY);
3265
3266                                         if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
3267                                                          THREAD_UNINT :
3268                                                          THREAD_ABORTSAFE))
3269                                                 goto RetryFault;
3270                                         kr = KERN_ABORTED;
3271                                         goto done;
3272                                 }
3273                                 if (vm_backing_store_low) {
3274                                         /*
3275                                          * we are protecting the system from
3276                                          * backing store exhaustion...
3277                                          * must take the slow path if we're
3278                                          * not privileged
3279                                          */
3280                                         if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
3281                                                 break;
3282                                 }
3283                                 if (cur_object != object) {
3284                                         vm_object_unlock(cur_object);
3285
3286                                         cur_object = object;
3287                                 }
3288                                 if (object_lock_type == OBJECT_LOCK_SHARED) {
3289
3290                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3291
3292                                         if (vm_object_lock_upgrade(object) == FALSE) {
3293                                                 /*
3294                                                  * couldn't upgrade so do a full retry on the fault
3295                                                  * since we dropped the object lock which
3296                                                  * could allow another thread to insert
3297                                                  * a page at this offset
3298                                                  */
3299                                                 vm_map_unlock_read(map);
3300                                                 if (real_map != map)
3301                                                         vm_map_unlock(real_map);
3302
3303                                                 goto RetryFault;
3304                                         }
3305                                 }
3306                                 m = vm_page_alloc(object, offset);
3307
3308                                 if (m == VM_PAGE_NULL) {
3309                                         /*
3310                                          * no free page currently available...
3311                                          * must take the slow path
3312                                          */
3313                                         break;
3314                                 }
3315
3316                                 /*
3317                                  * Now zero fill page...
3318                                  * the page is probably going to
3319                                  * be written soon, so don't bother
3320                                  * to clear the modified bit
3321                                  *
3322                                  *   NOTE: This code holds the map
3323                                  *   lock across the zero fill.
3324                                  */
3325                                 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
3326
3327                                 goto FastPmapEnter;
3328                         }
3329                         /*
3330                          * On to the next level in the shadow chain
3331                          */
3332                         cur_offset += cur_object->shadow_offset;
3333                         new_object = cur_object->shadow;
3334
3335                         /*
3336                          * take the new_object's lock with the indicated state
3337                          */
3338                         if (cur_object_lock_type == OBJECT_LOCK_SHARED)
3339                                 vm_object_lock_shared(new_object);
3340                         else
3341                                 vm_object_lock(new_object);
3342
3343                         if (cur_object != object)
3344                                 vm_object_unlock(cur_object);
3345
3346                         cur_object = new_object;
3347
3348                         continue;
3349                 }
3350         }
3351         /*
3352          * Cleanup from fast fault failure.  Drop any object
3353          * lock other than original and drop map lock.
3354          */
3355         if (object != cur_object)
3356                 vm_object_unlock(cur_object);
3357
3358         /*
3359          * must own the object lock exclusively at this point
3360          */
3361         if (object_lock_type == OBJECT_LOCK_SHARED) {
3362                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3363
3364                 if (vm_object_lock_upgrade(object) == FALSE) {
3365                         /*
3366                          * couldn't upgrade, so explicitly
3367                          * take the lock exclusively
3368                          * no need to retry the fault at this
3369                          * point since "vm_fault_page" will
3370                          * completely re-evaluate the state
3371                          */
3372                         vm_object_lock(object);
3373                 }
3374         }
3375
3376 handle_copy_delay:
3377         vm_map_unlock_read(map);
3378         if (real_map != map)
3379                 vm_map_unlock(real_map);
3380
3381         /*
3382          * Make a reference to this object to
3383          * prevent its disposal while we are messing with
3384          * it.  Once we have the reference, the map is free
3385          * to be diddled.  Since objects reference their
3386          * shadows (and copies), they will stay around as well.
3387          */
3388         vm_object_reference_locked(object);
3389         vm_object_paging_begin(object);
3390
3391         XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
3392
3393         error_code = 0;
3394
3395         kr = vm_fault_page(object, offset, fault_type,
3396                            (change_wiring && !wired),
3397                            &prot, &result_page, &top_page,
3398                            &type_of_fault,
3399                            &error_code, map->no_zero_fill,
3400                            FALSE, &fault_info);
3401
3402         /*
3403          * if kr != VM_FAULT_SUCCESS, then the paging reference
3404          * has been dropped and the object unlocked... the ref_count
3405          * is still held
3406          *
3407          * if kr == VM_FAULT_SUCCESS, then the paging reference
3408          * is still held along with the ref_count on the original object
3409          *
3410          *      the object is returned locked with a paging reference
3411          *
3412          *      if top_page != NULL, then it's BUSY and the
3413          *      object it belongs to has a paging reference
3414          *      but is returned unlocked
3415          */
3416         if (kr != VM_FAULT_SUCCESS &&
3417             kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
3418                 /*
3419                  * we didn't succeed, lose the object reference immediately.
3420                  */
3421                 vm_object_deallocate(object);
3422
3423                 /*
3424                  * See why we failed, and take corrective action.
3425                  */
3426                 switch (kr) {
3427                 case VM_FAULT_MEMORY_SHORTAGE:
3428                         if (vm_page_wait((change_wiring) ?
3429                                          THREAD_UNINT :
3430                                          THREAD_ABORTSAFE))
3431                                 goto RetryFault;
3432                         /*
3433                          * fall thru
3434                          */
3435                 case VM_FAULT_INTERRUPTED:
3436                         kr = KERN_ABORTED;
3437                         goto done;
3438                 case VM_FAULT_RETRY:
3439                         goto RetryFault;
3440                 case VM_FAULT_MEMORY_ERROR:
3441                         if (error_code)
3442                                 kr = error_code;
3443                         else
3444                                 kr = KERN_MEMORY_ERROR;
3445                         goto done;
3446                 default:
3447                         panic("vm_fault: unexpected error 0x%x from "
3448                               "vm_fault_page()\n", kr);
3449                 }
3450         }
3451         m = result_page;
3452
3453         if (m != VM_PAGE_NULL) {
3454                 assert((change_wiring && !wired) ?
3455                     (top_page == VM_PAGE_NULL) :
3456                     ((top_page == VM_PAGE_NULL) == (m->object == object)));
3457         }
3458
3459         /*
3460          * What to do with the resulting page from vm_fault_page
3461          * if it doesn't get entered into the physical map:
3462          */
3463 #define RELEASE_PAGE(m)                                 \
3464         MACRO_BEGIN                                     \
3465         PAGE_WAKEUP_DONE(m);                            \
3466         if (!m->active && !m->inactive && !m->throttled) {              \
3467                 vm_page_lockspin_queues();                              \
3468                 if (!m->active && !m->inactive && !m->throttled)        \
3469                         vm_page_activate(m);                            \
3470                 vm_page_unlock_queues();                                \
3471         }                                                               \
3472         MACRO_END
3473
3474         /*
3475          * We must verify that the maps have not changed
3476          * since our last lookup.
3477          */
3478         if (m != VM_PAGE_NULL) {
3479                 old_copy_object = m->object->copy;
3480                 vm_object_unlock(m->object);
3481         } else {
3482                 old_copy_object = VM_OBJECT_NULL;
3483                 vm_object_unlock(object);
3484         }
3485
3486         /*
3487          * no object locks are held at this point
3488          */
3489         if ((map != original_map) || !vm_map_verify(map, &version)) {
3490                 vm_object_t             retry_object;
3491                 vm_object_offset_t      retry_offset;
3492                 vm_prot_t               retry_prot;
3493
3494                 /*
3495                  * To avoid trying to write_lock the map while another
3496                  * thread has it read_locked (in vm_map_pageable), we
3497                  * do not try for write permission.  If the page is
3498                  * still writable, we will get write permission.  If it
3499                  * is not, or has been marked needs_copy, we enter the
3500                  * mapping without write permission, and will merely
3501                  * take another fault.
3502                  */
3503                 map = original_map;
3504                 vm_map_lock_read(map);
3505
3506                 kr = vm_map_lookup_locked(&map, vaddr,
3507                                           fault_type & ~VM_PROT_WRITE,
3508                                           OBJECT_LOCK_EXCLUSIVE, &version,
3509                                           &retry_object, &retry_offset, &retry_prot,
3510                                           &wired,
3511                                           &fault_info,
3512                                           &real_map);
3513                 pmap = real_map->pmap;
3514
3515                 if (kr != KERN_SUCCESS) {
3516                         vm_map_unlock_read(map);
3517
3518                         if (m != VM_PAGE_NULL) {
3519                                 /*
3520                                  * retake the lock so that
3521                                  * we can drop the paging reference
3522                                  * in vm_fault_cleanup and do the
3523                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
3524                                  */
3525                                 vm_object_lock(m->object);
3526
3527                                 RELEASE_PAGE(m);
3528
3529                                 vm_fault_cleanup(m->object, top_page);
3530                         } else {
3531                                 /*
3532                                  * retake the lock so that
3533                                  * we can drop the paging reference
3534                                  * in vm_fault_cleanup
3535                                  */
3536                                 vm_object_lock(object);
3537
3538                                 vm_fault_cleanup(object, top_page);
3539                         }
3540                         vm_object_deallocate(object);
3541
3542                         goto done;
3543                 }
3544                 vm_object_unlock(retry_object);
3545
3546                 if ((retry_object != object) || (retry_offset != offset)) {
3547
3548                         vm_map_unlock_read(map);
3549                         if (real_map != map)
3550                                 vm_map_unlock(real_map);
3551
3552                         if (m != VM_PAGE_NULL) {
3553                                 /*
3554                                  * retake the lock so that
3555                                  * we can drop the paging reference
3556                                  * in vm_fault_cleanup and do the
3557                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
3558                                  */
3559                                 vm_object_lock(m->object);
3560
3561                                 RELEASE_PAGE(m);
3562
3563                                 vm_fault_cleanup(m->object, top_page);
3564                         } else {
3565                                 /*
3566                                  * retake the lock so that
3567                                  * we can drop the paging reference
3568                                  * in vm_fault_cleanup
3569                                  */
3570                                 vm_object_lock(object);
3571
3572                                 vm_fault_cleanup(object, top_page);
3573                         }
3574                         vm_object_deallocate(object);
3575
3576                         goto RetryFault;
3577                 }
3578                 /*
3579                  * Check whether the protection has changed or the object
3580                  * has been copied while we left the map unlocked.
3581                  */
3582                 prot &= retry_prot;
3583         }
3584         if (m != VM_PAGE_NULL) {
3585                 vm_object_lock(m->object);
3586
3587                 if (m->object->copy != old_copy_object) {
3588                         /*
3589                          * The copy object changed while the top-level object
3590                          * was unlocked, so take away write permission.
3591                          */
3592                         prot &= ~VM_PROT_WRITE;
3593                 }
3594         } else
3595                 vm_object_lock(object);
3596
3597         /*
3598          * If we want to wire down this page, but no longer have
3599          * adequate permissions, we must start all over.
3600          */
3601         if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
3602
3603                 vm_map_verify_done(map, &version);
3604                 if (real_map != map)
3605                         vm_map_unlock(real_map);
3606
3607                 if (m != VM_PAGE_NULL) {
3608                         RELEASE_PAGE(m);
3609
3610                         vm_fault_cleanup(m->object, top_page);
3611                 } else
3612                         vm_fault_cleanup(object, top_page);
3613
3614                 vm_object_deallocate(object);
3615
3616                 goto RetryFault;
3617         }
3618         if (m != VM_PAGE_NULL) {
3619                 /*
3620                  * Put this page into the physical map.
3621                  * We had to do the unlock above because pmap_enter
3622                  * may cause other faults.  The page may be on
3623                  * the pageout queues.  If the pageout daemon comes
3624                  * across the page, it will remove it from the queues.
3625                  */
3626                 if (caller_pmap) {
3627                         kr = vm_fault_enter(m,
3628                                             caller_pmap,
3629                                             caller_pmap_addr,
3630                                             prot,
3631                                             wired,
3632                                             change_wiring,
3633                                             fault_info.no_cache,
3634                                             &type_of_fault);
3635                 } else {
3636                         kr = vm_fault_enter(m,
3637                                             pmap,
3638                                             vaddr,
3639                                             prot,
3640                                             wired,
3641                                             change_wiring,
3642                                             fault_info.no_cache,
3643                                             &type_of_fault);
3644                 }
3645                 if (kr != KERN_SUCCESS) {
3646                         /* abort this page fault */
3647                         vm_map_verify_done(map, &version);
3648                         if (real_map != map)
3649                                 vm_map_unlock(real_map);
3650                         PAGE_WAKEUP_DONE(m);
3651                         vm_fault_cleanup(m->object, top_page);
3652                         vm_object_deallocate(object);
3653                         goto done;
3654                 }
3655         } else {
3656
3657                 vm_map_entry_t          entry;
3658                 vm_map_offset_t         laddr;
3659                 vm_map_offset_t         ldelta, hdelta;
3660
3661                 /*
3662                  * do a pmap block mapping from the physical address
3663                  * in the object
3664                  */
3665
3666 #ifdef ppc
3667                 /* While we do not worry about execution protection in   */
3668                 /* general, certain pages may have instruction execution */
3669                 /* disallowed.  We will check here, and if not allowed   */
3670                 /* to execute, we return with a protection failure.      */
3671
3672                 if ((fault_type & VM_PROT_EXECUTE) &&
3673                         (!pmap_eligible_for_execute((ppnum_t)(object->shadow_offset >> 12)))) {
3674
3675                         vm_map_verify_done(map, &version);
3676
3677                         if (real_map != map)
3678                                 vm_map_unlock(real_map);
3679
3680                         vm_fault_cleanup(object, top_page);
3681                         vm_object_deallocate(object);
3682
3683                         kr = KERN_PROTECTION_FAILURE;
3684                         goto done;
3685                 }
3686 #endif  /* ppc */
3687
3688                 if (real_map != map)
3689                         vm_map_unlock(real_map);
3690
3691                 if (original_map != map) {
3692                         vm_map_unlock_read(map);
3693                         vm_map_lock_read(original_map);
3694                         map = original_map;
3695                 }
3696                 real_map = map;
3697
3698                 laddr = vaddr;
3699                 hdelta = 0xFFFFF000;
3700                 ldelta = 0xFFFFF000;
3701
3702                 while (vm_map_lookup_entry(map, laddr, &entry)) {
3703                         if (ldelta > (laddr - entry->vme_start))
3704                                 ldelta = laddr - entry->vme_start;
3705                         if (hdelta > (entry->vme_end - laddr))
3706                                 hdelta = entry->vme_end - laddr;
3707                         if (entry->is_sub_map) {
3708
3709                                 laddr = (laddr - entry->vme_start)
3710                                                         + entry->offset;
3711                                 vm_map_lock_read(entry->object.sub_map);
3712
3713                                 if (map != real_map)
3714                                         vm_map_unlock_read(map);
3715                                 if (entry->use_pmap) {
3716                                         vm_map_unlock_read(real_map);
3717                                         real_map = entry->object.sub_map;
3718                                 }
3719                                 map = entry->object.sub_map;
3720
3721                         } else {
3722                                 break;
3723                         }
3724                 }
3725
3726                 if (vm_map_lookup_entry(map, laddr, &entry) &&
3727                                         (entry->object.vm_object != NULL) &&
3728                                         (entry->object.vm_object == object)) {
3729
3730                         int superpage = (!object->pager_created && object->phys_contiguous)? VM_MEM_SUPERPAGE : 0;
3731                         if (caller_pmap) {
3732                                 /*
3733                                  * Set up a block mapped area
3734                                  */
3735                                 assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12));
3736                                 pmap_map_block(caller_pmap,
3737                                                (addr64_t)(caller_pmap_addr - ldelta),
3738                                                (ppnum_t)((((vm_map_offset_t) (entry->object.vm_object->shadow_offset)) +
3739                                                           entry->offset + (laddr - entry->vme_start) - ldelta) >> 12),
3740                                                (uint32_t)((ldelta + hdelta) >> 12), prot,
3741                                                (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
3742                         } else {
3743                                 /*
3744                                  * Set up a block mapped area
3745                                  */
3746                                 assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12));
3747                                 pmap_map_block(real_map->pmap,
3748                                                (addr64_t)(vaddr - ldelta),
3749                                                (ppnum_t)((((vm_map_offset_t)(entry->object.vm_object->shadow_offset)) +
3750                                                           entry->offset + (laddr - entry->vme_start) - ldelta) >> 12),
3751                                                (uint32_t)((ldelta + hdelta) >> 12), prot,
3752                                                (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
3753                         }
3754                 }
3755         }
3756
3757         /*
3758          * Unlock everything, and return
3759          */
3760         vm_map_verify_done(map, &version);
3761         if (real_map != map)
3762                 vm_map_unlock(real_map);
3763
3764         if (m != VM_PAGE_NULL) {
3765                 PAGE_WAKEUP_DONE(m);
3766
3767                 vm_fault_cleanup(m->object, top_page);
3768         } else
3769                 vm_fault_cleanup(object, top_page);
3770
3771         vm_object_deallocate(object);
3772
3773 #undef  RELEASE_PAGE
3774
3775         kr = KERN_SUCCESS;
3776 done:
3777         thread_interrupt_level(interruptible_state);
3778
3779         KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3780                               (int)((uint64_t)vaddr >> 32),
3781                               (int)vaddr,
3782                               kr,
3783                               type_of_fault,
3784                               0);
3785
3786         return (kr);
3787 }
3788
3789 /*
3790  *      vm_fault_wire:
3791  *
3792  *      Wire down a range of virtual addresses in a map.
3793  */
3794 kern_return_t
3795 vm_fault_wire(
3796         vm_map_t        map,
3797         vm_map_entry_t  entry,
3798         pmap_t          pmap,
3799         vm_map_offset_t pmap_addr)
3800 {
3801
3802         register vm_map_offset_t        va;
3803         register vm_map_offset_t        end_addr = entry->vme_end;
3804         register kern_return_t  rc;
3805
3806         assert(entry->in_transition);
3807
3808         if ((entry->object.vm_object != NULL) &&
3809                         !entry->is_sub_map &&
3810                         entry->object.vm_object->phys_contiguous) {
3811                 return KERN_SUCCESS;
3812         }
3813
3814         /*
3815          *      Inform the physical mapping system that the
3816          *      range of addresses may not fault, so that
3817          *      page tables and such can be locked down as well.
3818          */
3819
3820         pmap_pageable(pmap, pmap_addr,
3821                 pmap_addr + (end_addr - entry->vme_start), FALSE);
3822
3823         /*
3824          *      We simulate a fault to get the page and enter it
3825          *      in the physical map.
3826          */
3827
3828         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3829                 if ((rc = vm_fault_wire_fast(
3830                         map, va, entry, pmap,
3831                         pmap_addr + (va - entry->vme_start)
3832                         )) != KERN_SUCCESS) {
3833                         rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
3834                                 (pmap == kernel_pmap) ?
3835                                         THREAD_UNINT : THREAD_ABORTSAFE,
3836                                 pmap, pmap_addr + (va - entry->vme_start));
3837                         DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
3838                 }
3839
3840                 if (rc != KERN_SUCCESS) {
3841                         struct vm_map_entry     tmp_entry = *entry;
3842
3843                         /* unwire wired pages */
3844                         tmp_entry.vme_end = va;
3845                         vm_fault_unwire(map,
3846                                 &tmp_entry, FALSE, pmap, pmap_addr);
3847
3848                         return rc;
3849                 }
3850         }
3851         return KERN_SUCCESS;
3852 }
3853
3854 /*
3855  *      vm_fault_unwire:
3856  *
3857  *      Unwire a range of virtual addresses in a map.
3858  */
3859 void
3860 vm_fault_unwire(
3861         vm_map_t        map,
3862         vm_map_entry_t  entry,
3863         boolean_t       deallocate,
3864         pmap_t          pmap,
3865         vm_map_offset_t pmap_addr)
3866 {
3867         register vm_map_offset_t        va;
3868         register vm_map_offset_t        end_addr = entry->vme_end;
3869         vm_object_t             object;
3870         struct vm_object_fault_info fault_info;
3871
3872         object = (entry->is_sub_map)
3873                         ? VM_OBJECT_NULL : entry->object.vm_object;
3874
3875         /*
3876          * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
3877          * do anything since such memory is wired by default.  So we don't have
3878          * anything to undo here.
3879          */
3880
3881         if (object != VM_OBJECT_NULL && object->phys_contiguous)
3882                 return;
3883
3884         fault_info.interruptible = THREAD_UNINT;
3885         fault_info.behavior = entry->behavior;
3886         fault_info.user_tag = entry->alias;
3887         fault_info.lo_offset = entry->offset;
3888         fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
3889         fault_info.no_cache = entry->no_cache;
3890         fault_info.stealth = TRUE;
3891         fault_info.mark_zf_absent = FALSE;
3892
3893         /*
3894          *      Since the pages are wired down, we must be able to
3895          *      get their mappings from the physical map system.
3896          */
3897
3898         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3899
3900                 if (object == VM_OBJECT_NULL) {
3901                         if (pmap) {
3902                                 pmap_change_wiring(pmap,
3903                                                    pmap_addr + (va - entry->vme_start), FALSE);
3904                         }
3905                         (void) vm_fault(map, va, VM_PROT_NONE,
3906                                         TRUE, THREAD_UNINT, pmap, pmap_addr);
3907                 } else {
3908                         vm_prot_t       prot;
3909                         vm_page_t       result_page;
3910                         vm_page_t       top_page;
3911                         vm_object_t     result_object;
3912                         vm_fault_return_t result;
3913
3914                         if (end_addr - va > (vm_size_t) -1) {
3915                                 /* 32-bit overflow */
3916                                 fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
3917                         } else {
3918                                 fault_info.cluster_size = (vm_size_t) (end_addr - va);
3919                                 assert(fault_info.cluster_size == end_addr - va);
3920                         }
3921
3922                         do {
3923                                 prot = VM_PROT_NONE;
3924
3925                                 vm_object_lock(object);
3926                                 vm_object_paging_begin(object);
3927                                 XPR(XPR_VM_FAULT,
3928                                         "vm_fault_unwire -> vm_fault_page\n",
3929                                         0,0,0,0,0);
3930                                 result = vm_fault_page(
3931                                         object,
3932                                         entry->offset + (va - entry->vme_start),
3933                                         VM_PROT_NONE, TRUE,
3934                                         &prot, &result_page, &top_page,
3935                                         (int *)0,
3936                                         NULL, map->no_zero_fill,
3937                                         FALSE, &fault_info);
3938                         } while (result == VM_FAULT_RETRY);
3939
3940                         /*
3941                          * If this was a mapping to a file on a device that has been forcibly
3942                          * unmounted, then we won't get a page back from vm_fault_page().  Just
3943                          * move on to the next one in case the remaining pages are mapped from
3944                          * different objects.  During a forced unmount, the object is terminated
3945                          * so the alive flag will be false if this happens.  A forced unmount will
3946                          * will occur when an external disk is unplugged before the user does an
3947                          * eject, so we don't want to panic in that situation.
3948                          */
3949
3950                         if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
3951                                 continue;
3952
3953                         if (result != VM_FAULT_SUCCESS)
3954                                 panic("vm_fault_unwire: failure");
3955
3956                         result_object = result_page->object;
3957
3958                         if ((pmap) && (result_page->phys_page != vm_page_guard_addr)) {
3959                                 pmap_change_wiring(pmap,
3960                                                    pmap_addr + (va - entry->vme_start), FALSE);
3961                         }
3962                         if (deallocate) {
3963                                 assert(result_page->phys_page !=
3964                                        vm_page_fictitious_addr);
3965                                 pmap_disconnect(result_page->phys_page);
3966                                 VM_PAGE_FREE(result_page);
3967                         } else {
3968                                 if (VM_PAGE_WIRED(result_page)) {
3969                                         vm_page_lockspin_queues();
3970                                         vm_page_unwire(result_page, TRUE);
3971                                         vm_page_unlock_queues();
3972                                 }
3973                                 if(entry->zero_wired_pages) {
3974                                         pmap_zero_page(result_page->phys_page);
3975                                         entry->zero_wired_pages = FALSE;
3976                                 }
3977
3978                                 PAGE_WAKEUP_DONE(result_page);
3979                         }
3980                         vm_fault_cleanup(result_object, top_page);
3981                 }
3982         }
3983
3984         /*
3985          *      Inform the physical mapping system that the range
3986          *      of addresses may fault, so that page tables and
3987          *      such may be unwired themselves.
3988          */
3989
3990         pmap_pageable(pmap, pmap_addr,
3991                 pmap_addr + (end_addr - entry->vme_start), TRUE);
3992
3993 }
3994
3995 /*
3996  *      vm_fault_wire_fast:
3997  *
3998  *      Handle common case of a wire down page fault at the given address.
3999  *      If successful, the page is inserted into the associated physical map.
4000  *      The map entry is passed in to avoid the overhead of a map lookup.
4001  *
4002  *      NOTE: the given address should be truncated to the
4003  *      proper page address.
4004  *
4005  *      KERN_SUCCESS is returned if the page fault is handled; otherwise,
4006  *      a standard error specifying why the fault is fatal is returned.
4007  *
4008  *      The map in question must be referenced, and remains so.
4009  *      Caller has a read lock on the map.
4010  *
4011  *      This is a stripped version of vm_fault() for wiring pages.  Anything
4012  *      other than the common case will return KERN_FAILURE, and the caller
4013  *      is expected to call vm_fault().
4014  */
4015 kern_return_t
4016 vm_fault_wire_fast(
4017         __unused vm_map_t       map,
4018         vm_map_offset_t va,
4019         vm_map_entry_t  entry,
4020         pmap_t                  pmap,
4021         vm_map_offset_t pmap_addr)
4022 {
4023         vm_object_t             object;
4024         vm_object_offset_t      offset;
4025         register vm_page_t      m;
4026         vm_prot_t               prot;
4027         thread_t                thread = current_thread();
4028         int                     type_of_fault;
4029         kern_return_t           kr;
4030
4031         VM_STAT_INCR(faults);
4032
4033         if (thread != THREAD_NULL && thread->task != TASK_NULL)
4034           thread->task->faults++;
4035
4036 /*
4037  *      Recovery actions
4038  */
4039
4040 #undef  RELEASE_PAGE
4041 #define RELEASE_PAGE(m) {                               \
4042         PAGE_WAKEUP_DONE(m);                            \
4043         vm_page_lockspin_queues();                      \
4044         vm_page_unwire(m, TRUE);                        \
4045         vm_page_unlock_queues();                        \
4046 }
4047
4048
4049 #undef  UNLOCK_THINGS
4050 #define UNLOCK_THINGS   {                               \
4051         vm_object_paging_end(object);                      \
4052         vm_object_unlock(object);                          \
4053 }
4054
4055 #undef  UNLOCK_AND_DEALLOCATE
4056 #define UNLOCK_AND_DEALLOCATE   {                       \
4057         UNLOCK_THINGS;                                  \
4058         vm_object_deallocate(object);                   \
4059 }
4060 /*
4061  *      Give up and have caller do things the hard way.
4062  */
4063
4064 #define GIVE_UP {                                       \
4065         UNLOCK_AND_DEALLOCATE;                          \
4066         return(KERN_FAILURE);                           \
4067 }
4068
4069
4070         /*
4071          *      If this entry is not directly to a vm_object, bail out.
4072          */
4073         if (entry->is_sub_map)
4074                 return(KERN_FAILURE);
4075
4076         /*
4077          *      Find the backing store object and offset into it.
4078          */
4079
4080         object = entry->object.vm_object;
4081         offset = (va - entry->vme_start) + entry->offset;
4082         prot = entry->protection;
4083
4084         /*
4085          *      Make a reference to this object to prevent its
4086          *      disposal while we are messing with it.
4087          */
4088
4089         vm_object_lock(object);
4090         vm_object_reference_locked(object);
4091         vm_object_paging_begin(object);
4092
4093         /*
4094          *      INVARIANTS (through entire routine):
4095          *
4096          *      1)      At all times, we must either have the object
4097          *              lock or a busy page in some object to prevent
4098          *              some other thread from trying to bring in
4099          *              the same page.
4100          *
4101          *      2)      Once we have a busy page, we must remove it from
4102          *              the pageout queues, so that the pageout daemon
4103          *              will not grab it away.
4104          *
4105          */
4106
4107         /*
4108          *      Look for page in top-level object.  If it's not there or
4109          *      there's something going on, give up.
4110          * ENCRYPTED SWAP: use the slow fault path, since we'll need to
4111          * decrypt the page before wiring it down.
4112          */
4113         m = vm_page_lookup(object, offset);
4114         if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
4115             (m->unusual && ( m->error || m->restart || m->absent))) {
4116
4117                 GIVE_UP;
4118         }
4119         ASSERT_PAGE_DECRYPTED(m);
4120
4121         if (m->fictitious &&
4122             m->phys_page == vm_page_guard_addr) {
4123                 /*
4124                  * Guard pages are fictitious pages and are never
4125                  * entered into a pmap, so let's say it's been wired...
4126                  */
4127                 kr = KERN_SUCCESS;
4128                 goto done;
4129         }
4130
4131         /*
4132          *      Wire the page down now.  All bail outs beyond this
4133          *      point must unwire the page.
4134          */
4135
4136         vm_page_lockspin_queues();
4137         vm_page_wire(m);
4138         vm_page_unlock_queues();
4139
4140         /*
4141          *      Mark page busy for other threads.
4142          */
4143         assert(!m->busy);
4144         m->busy = TRUE;
4145         assert(!m->absent);
4146
4147         /*
4148          *      Give up if the page is being written and there's a copy object
4149          */
4150         if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
4151                 RELEASE_PAGE(m);
4152                 GIVE_UP;
4153         }
4154
4155         /*
4156          *      Put this page into the physical map.
4157          */
4158         type_of_fault = DBG_CACHE_HIT_FAULT;
4159         kr = vm_fault_enter(m,
4160                             pmap,
4161                             pmap_addr,
4162                             prot,
4163                             TRUE,
4164                             FALSE,
4165                             FALSE,
4166                             &type_of_fault);
4167
4168 done:
4169         /*
4170          *      Unlock everything, and return
4171          */
4172
4173         PAGE_WAKEUP_DONE(m);
4174         UNLOCK_AND_DEALLOCATE;
4175
4176         return kr;
4177
4178 }
4179
4180 /*
4181  *      Routine:        vm_fault_copy_cleanup
4182  *      Purpose:
4183  *              Release a page used by vm_fault_copy.
4184  */
4185
4186 void
4187 vm_fault_copy_cleanup(
4188         vm_page_t       page,
4189         vm_page_t       top_page)
4190 {
4191         vm_object_t     object = page->object;
4192
4193         vm_object_lock(object);
4194         PAGE_WAKEUP_DONE(page);
4195         if (!page->active && !page->inactive && !page->throttled) {
4196                 vm_page_lockspin_queues();
4197                 if (!page->active && !page->inactive && !page->throttled)
4198                         vm_page_activate(page);
4199                 vm_page_unlock_queues();
4200         }
4201         vm_fault_cleanup(object, top_page);
4202 }
4203
4204 void
4205 vm_fault_copy_dst_cleanup(
4206         vm_page_t       page)
4207 {
4208         vm_object_t     object;
4209
4210         if (page != VM_PAGE_NULL) {
4211                 object = page->object;
4212                 vm_object_lock(object);
4213                 vm_page_lockspin_queues();
4214                 vm_page_unwire(page, TRUE);
4215                 vm_page_unlock_queues();
4216                 vm_object_paging_end(object);
4217                 vm_object_unlock(object);
4218         }
4219 }
4220
4221 /*
4222  *      Routine:        vm_fault_copy
4223  *
4224  *      Purpose:
4225  *              Copy pages from one virtual memory object to another --
4226  *              neither the source nor destination pages need be resident.
4227  *
4228  *              Before actually copying a page, the version associated with
4229  *              the destination address map wil be verified.
4230  *
4231  *      In/out conditions:
4232  *              The caller must hold a reference, but not a lock, to
4233  *              each of the source and destination objects and to the
4234  *              destination map.
4235  *
4236  *      Results:
4237  *              Returns KERN_SUCCESS if no errors were encountered in
4238  *              reading or writing the data.  Returns KERN_INTERRUPTED if
4239  *              the operation was interrupted (only possible if the
4240  *              "interruptible" argument is asserted).  Other return values
4241  *              indicate a permanent error in copying the data.
4242  *
4243  *              The actual amount of data copied will be returned in the
4244  *              "copy_size" argument.  In the event that the destination map
4245  *              verification failed, this amount may be less than the amount
4246  *              requested.
4247  */
4248 kern_return_t
4249 vm_fault_copy(
4250         vm_object_t             src_object,
4251         vm_object_offset_t      src_offset,
4252         vm_map_size_t           *copy_size,             /* INOUT */
4253         vm_object_t             dst_object,
4254         vm_object_offset_t      dst_offset,
4255         vm_map_t                dst_map,
4256         vm_map_version_t         *dst_version,
4257         int                     interruptible)
4258 {
4259         vm_page_t               result_page;
4260
4261         vm_page_t               src_page;
4262         vm_page_t               src_top_page;
4263         vm_prot_t               src_prot;
4264
4265         vm_page_t               dst_page;
4266         vm_page_t               dst_top_page;
4267         vm_prot_t               dst_prot;
4268
4269         vm_map_size_t           amount_left;
4270         vm_object_t             old_copy_object;
4271         kern_return_t           error = 0;
4272         vm_fault_return_t       result;
4273
4274         vm_map_size_t           part_size;
4275         struct vm_object_fault_info fault_info_src;
4276         struct vm_object_fault_info fault_info_dst;
4277
4278         /*
4279          * In order not to confuse the clustered pageins, align
4280          * the different offsets on a page boundary.
4281          */
4282
4283 #define RETURN(x)                                       \
4284         MACRO_BEGIN                                     \
4285         *copy_size -= amount_left;                      \
4286         MACRO_RETURN(x);                                \
4287         MACRO_END
4288
4289         amount_left = *copy_size;
4290
4291         fault_info_src.interruptible = interruptible;
4292         fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
4293         fault_info_src.user_tag  = 0;
4294         fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
4295         fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
4296         fault_info_src.no_cache   = FALSE;
4297         fault_info_src.stealth = TRUE;
4298         fault_info_src.mark_zf_absent = FALSE;
4299
4300         fault_info_dst.interruptible = interruptible;
4301         fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
4302         fault_info_dst.user_tag  = 0;
4303         fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
4304         fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
4305         fault_info_dst.no_cache   = FALSE;
4306         fault_info_dst.stealth = TRUE;
4307         fault_info_dst.mark_zf_absent = FALSE;
4308
4309         do { /* while (amount_left > 0) */
4310                 /*
4311                  * There may be a deadlock if both source and destination
4312                  * pages are the same. To avoid this deadlock, the copy must
4313                  * start by getting the destination page in order to apply
4314                  * COW semantics if any.
4315                  */
4316
4317         RetryDestinationFault: ;
4318
4319                 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
4320
4321                 vm_object_lock(dst_object);
4322                 vm_object_paging_begin(dst_object);
4323
4324                 if (amount_left > (vm_size_t) -1) {
4325                         /* 32-bit overflow */
4326                         fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4327                 } else {
4328                         fault_info_dst.cluster_size = (vm_size_t) amount_left;
4329                         assert(fault_info_dst.cluster_size == amount_left);
4330                 }
4331
4332                 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
4333                 result = vm_fault_page(dst_object,
4334                                        vm_object_trunc_page(dst_offset),
4335                                        VM_PROT_WRITE|VM_PROT_READ,
4336                                        FALSE,
4337                                        &dst_prot, &dst_page, &dst_top_page,
4338                                        (int *)0,
4339                                        &error,
4340                                        dst_map->no_zero_fill,
4341                                        FALSE, &fault_info_dst);
4342                 switch (result) {
4343                 case VM_FAULT_SUCCESS:
4344                         break;
4345                 case VM_FAULT_RETRY:
4346                         goto RetryDestinationFault;
4347                 case VM_FAULT_MEMORY_SHORTAGE:
4348                         if (vm_page_wait(interruptible))
4349                                 goto RetryDestinationFault;
4350                         /* fall thru */
4351                 case VM_FAULT_INTERRUPTED:
4352                         RETURN(MACH_SEND_INTERRUPTED);
4353                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
4354                         /* success but no VM page: fail the copy */
4355                         vm_object_paging_end(dst_object);
4356                         vm_object_unlock(dst_object);
4357                         /*FALLTHROUGH*/
4358                 case VM_FAULT_MEMORY_ERROR:
4359                         if (error)
4360                                 return (error);
4361                         else
4362                                 return(KERN_MEMORY_ERROR);
4363                 default:
4364                         panic("vm_fault_copy: unexpected error 0x%x from "
4365                               "vm_fault_page()\n", result);
4366                 }
4367                 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
4368
4369                 old_copy_object = dst_page->object->copy;
4370
4371                 /*
4372                  * There exists the possiblity that the source and
4373                  * destination page are the same.  But we can't
4374                  * easily determine that now.  If they are the
4375                  * same, the call to vm_fault_page() for the
4376                  * destination page will deadlock.  To prevent this we
4377                  * wire the page so we can drop busy without having
4378                  * the page daemon steal the page.  We clean up the
4379                  * top page  but keep the paging reference on the object
4380                  * holding the dest page so it doesn't go away.
4381                  */
4382
4383                 vm_page_lockspin_queues();
4384                 vm_page_wire(dst_page);
4385                 vm_page_unlock_queues();
4386                 PAGE_WAKEUP_DONE(dst_page);
4387                 vm_object_unlock(dst_page->object);
4388
4389                 if (dst_top_page != VM_PAGE_NULL) {
4390                         vm_object_lock(dst_object);
4391                         VM_PAGE_FREE(dst_top_page);
4392                         vm_object_paging_end(dst_object);
4393                         vm_object_unlock(dst_object);
4394                 }
4395
4396         RetrySourceFault: ;
4397
4398                 if (src_object == VM_OBJECT_NULL) {
4399                         /*
4400                          *      No source object.  We will just
4401                          *      zero-fill the page in dst_object.
4402                          */
4403                         src_page = VM_PAGE_NULL;
4404                         result_page = VM_PAGE_NULL;
4405                 } else {
4406                         vm_object_lock(src_object);
4407                         src_page = vm_page_lookup(src_object,
4408                                                   vm_object_trunc_page(src_offset));
4409                         if (src_page == dst_page) {
4410                                 src_prot = dst_prot;
4411                                 result_page = VM_PAGE_NULL;
4412                         } else {
4413                                 src_prot = VM_PROT_READ;
4414                                 vm_object_paging_begin(src_object);
4415
4416                                 if (amount_left > (vm_size_t) -1) {
4417                                         /* 32-bit overflow */
4418                                         fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4419                                 } else {
4420                                         fault_info_src.cluster_size = (vm_size_t) amount_left;
4421                                         assert(fault_info_src.cluster_size == amount_left);
4422                                 }
4423
4424                                 XPR(XPR_VM_FAULT,
4425                                         "vm_fault_copy(2) -> vm_fault_page\n",
4426                                         0,0,0,0,0);
4427                                 result = vm_fault_page(
4428                                         src_object,
4429                                         vm_object_trunc_page(src_offset),
4430                                         VM_PROT_READ, FALSE,
4431                                         &src_prot,
4432                                         &result_page, &src_top_page,
4433                                         (int *)0, &error, FALSE,
4434                                         FALSE, &fault_info_src);
4435
4436                                 switch (result) {
4437                                 case VM_FAULT_SUCCESS:
4438                                         break;
4439                                 case VM_FAULT_RETRY:
4440                                         goto RetrySourceFault;
4441                                 case VM_FAULT_MEMORY_SHORTAGE:
4442                                         if (vm_page_wait(interruptible))
4443                                                 goto RetrySourceFault;
4444                                         /* fall thru */
4445                                 case VM_FAULT_INTERRUPTED:
4446                                         vm_fault_copy_dst_cleanup(dst_page);
4447                                         RETURN(MACH_SEND_INTERRUPTED);
4448                                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
4449                                         /* success but no VM page: fail */
4450                                         vm_object_paging_end(src_object);
4451                                         vm_object_unlock(src_object);
4452                                         /*FALLTHROUGH*/
4453                                 case VM_FAULT_MEMORY_ERROR:
4454                                         vm_fault_copy_dst_cleanup(dst_page);
4455                                         if (error)
4456                                                 return (error);
4457                                         else
4458                                                 return(KERN_MEMORY_ERROR);
4459                                 default:
4460                                         panic("vm_fault_copy(2): unexpected "
4461                                               "error 0x%x from "
4462                                               "vm_fault_page()\n", result);
4463                                 }
4464
4465
4466                                 assert((src_top_page == VM_PAGE_NULL) ==
4467                                        (result_page->object == src_object));
4468                         }
4469                         assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
4470                         vm_object_unlock(result_page->object);
4471                 }
4472
4473                 if (!vm_map_verify(dst_map, dst_version)) {
4474                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
4475                                 vm_fault_copy_cleanup(result_page, src_top_page);
4476                         vm_fault_copy_dst_cleanup(dst_page);
4477                         break;
4478                 }
4479
4480                 vm_object_lock(dst_page->object);
4481
4482                 if (dst_page->object->copy != old_copy_object) {
4483                         vm_object_unlock(dst_page->object);
4484                         vm_map_verify_done(dst_map, dst_version);
4485                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
4486                                 vm_fault_copy_cleanup(result_page, src_top_page);
4487                         vm_fault_copy_dst_cleanup(dst_page);
4488                         break;
4489                 }
4490                 vm_object_unlock(dst_page->object);
4491
4492                 /*
4493                  *      Copy the page, and note that it is dirty
4494                  *      immediately.
4495                  */
4496
4497                 if (!page_aligned(src_offset) ||
4498                         !page_aligned(dst_offset) ||
4499                         !page_aligned(amount_left)) {
4500
4501                         vm_object_offset_t      src_po,
4502                                                 dst_po;
4503
4504                         src_po = src_offset - vm_object_trunc_page(src_offset);
4505                         dst_po = dst_offset - vm_object_trunc_page(dst_offset);
4506
4507                         if (dst_po > src_po) {
4508                                 part_size = PAGE_SIZE - dst_po;
4509                         } else {
4510                                 part_size = PAGE_SIZE - src_po;
4511                         }
4512                         if (part_size > (amount_left)){
4513                                 part_size = amount_left;
4514                         }
4515
4516                         if (result_page == VM_PAGE_NULL) {
4517                                 assert((vm_offset_t) dst_po == dst_po);
4518                                 assert((vm_size_t) part_size == part_size);
4519                                 vm_page_part_zero_fill(dst_page,
4520                                                        (vm_offset_t) dst_po,
4521                                                        (vm_size_t) part_size);
4522                         } else {
4523                                 assert((vm_offset_t) src_po == src_po);
4524                                 assert((vm_offset_t) dst_po == dst_po);
4525                                 assert((vm_size_t) part_size == part_size);
4526                                 vm_page_part_copy(result_page,
4527                                                   (vm_offset_t) src_po,
4528                                                   dst_page,
4529                                                   (vm_offset_t) dst_po,
4530                                                   (vm_size_t)part_size);
4531                                 if(!dst_page->dirty){
4532                                         vm_object_lock(dst_object);
4533                                         dst_page->dirty = TRUE;
4534                                         vm_object_unlock(dst_page->object);
4535                                 }
4536
4537                         }
4538                 } else {
4539                         part_size = PAGE_SIZE;
4540
4541                         if (result_page == VM_PAGE_NULL)
4542                                 vm_page_zero_fill(dst_page);
4543                         else{
4544                                 vm_page_copy(result_page, dst_page);
4545                                 if(!dst_page->dirty){
4546                                         vm_object_lock(dst_object);
4547                                         dst_page->dirty = TRUE;
4548                                         vm_object_unlock(dst_page->object);
4549                                 }
4550                         }
4551
4552                 }
4553
4554                 /*
4555                  *      Unlock everything, and return
4556                  */
4557
4558                 vm_map_verify_done(dst_map, dst_version);
4559
4560                 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4561                         vm_fault_copy_cleanup(result_page, src_top_page);
4562                 vm_fault_copy_dst_cleanup(dst_page);
4563
4564                 amount_left -= part_size;
4565                 src_offset += part_size;
4566                 dst_offset += part_size;
4567         } while (amount_left > 0);
4568
4569         RETURN(KERN_SUCCESS);
4570 #undef  RETURN
4571
4572         /*NOTREACHED*/
4573 }
4574
4575 #if     VM_FAULT_CLASSIFY
4576 /*
4577  *      Temporary statistics gathering support.
4578  */
4579
/*
 *      Fault classification counters.
 *
 *      vm_fault_stats is a two-dimensional table of counters.  The
 *      first index is one of the VM_FAULT_TYPE_* codes, the second is
 *      a level index in the range 0 .. VM_FAULT_LEVEL_MAX - 1.
 */

/* first index: fault type codes */
#define VM_FAULT_TYPE_ZERO_FILL 0
#define VM_FAULT_TYPE_MAP_IN    1
#define VM_FAULT_TYPE_PAGER     2
#define VM_FAULT_TYPE_COPY      3
#define VM_FAULT_TYPE_OTHER     4

/* table dimensions: number of type codes, number of level slots */
#define VM_FAULT_TYPES_MAX      5
#define VM_FAULT_LEVEL_MAX      8

int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4593
4594
4595 void
4596 vm_fault_classify(vm_object_t           object,
4597                   vm_object_offset_t    offset,
4598                   vm_prot_t             fault_type)
4599 {
4600         int             type, level = 0;
4601         vm_page_t       m;
4602
4603         while (TRUE) {
4604                 m = vm_page_lookup(object, offset);
4605                 if (m != VM_PAGE_NULL) {
4606                         if (m->busy || m->error || m->restart || m->absent) {
4607                                 type = VM_FAULT_TYPE_OTHER;
4608                                 break;
4609                         }
4610                         if (((fault_type & VM_PROT_WRITE) == 0) ||
4611                             ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4612                                 type = VM_FAULT_TYPE_MAP_IN;
4613                                 break;
4614                         }
4615                         type = VM_FAULT_TYPE_COPY;
4616                         break;
4617                 }
4618                 else {
4619                         if (object->pager_created) {
4620                                 type = VM_FAULT_TYPE_PAGER;
4621                                 break;
4622                         }
4623                         if (object->shadow == VM_OBJECT_NULL) {
4624                                 type = VM_FAULT_TYPE_ZERO_FILL;
4625                                 break;
4626                         }
4627
4628                         offset += object->shadow_offset;
4629                         object = object->shadow;
4630                         level++;
4631                         continue;
4632                 }
4633         }
4634
4635         if (level > VM_FAULT_LEVEL_MAX)
4636                 level = VM_FAULT_LEVEL_MAX;
4637
4638         vm_fault_stats[type][level] += 1;
4639
4640         return;
4641 }
4642
4643 /* cleanup routine to call from debugger */
4644
4645 void
4646 vm_fault_classify_init(void)
4647 {
4648         int type, level;
4649
4650         for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
4651                 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
4652                         vm_fault_stats[type][level] = 0;
4653                 }
4654         }
4655
4656         return;
4657 }
4658 #endif  /* VM_FAULT_CLASSIFY */
4659
4660
4661 extern int cs_validation;
4662
4663 void
4664 vm_page_validate_cs_mapped(
4665         vm_page_t       page,
4666         const void      *kaddr)
4667 {
4668         vm_object_t             object;
4669         vm_object_offset_t      offset;
4670         kern_return_t           kr;
4671         memory_object_t         pager;
4672         void                    *blobs;
4673         boolean_t               validated, tainted;
4674
4675         assert(page->busy);
4676         vm_object_lock_assert_exclusive(page->object);
4677
4678         if (!cs_validation) {
4679                 return;
4680         }
4681
4682         if (page->wpmapped && !page->cs_tainted) {
4683                 /*
4684                  * This page was mapped for "write" access sometime in the
4685                  * past and could still be modifiable in the future.
4686                  * Consider it tainted.
4687                  * [ If the page was already found to be "tainted", no
4688                  * need to re-validate. ]
4689                  */
4690                 page->cs_validated = TRUE;
4691                 page->cs_tainted = TRUE;
4692                 if (cs_debug) {
4693                         printf("CODESIGNING: vm_page_validate_cs: "
4694                                "page %p obj %p off 0x%llx "
4695                                "was modified\n",
4696                                page, page->object, page->offset);
4697                 }
4698                 vm_cs_validated_dirtied++;
4699         }
4700
4701         if (page->cs_validated) {
4702                 return;
4703         }
4704
4705         vm_cs_validates++;
4706
4707         object = page->object;
4708         assert(object->code_signed);
4709         offset = page->offset;
4710
4711         if (!object->alive || object->terminating || object->pager == NULL) {
4712                 /*
4713                  * The object is terminating and we don't have its pager
4714                  * so we can't validate the data...
4715                  */
4716                 return;
4717         }
4718         /*
4719          * Since we get here to validate a page that was brought in by
4720          * the pager, we know that this pager is all setup and ready
4721          * by now.
4722          */
4723         assert(!object->internal);
4724         assert(object->pager != NULL);
4725         assert(object->pager_ready);
4726
4727         pager = object->pager;
4728         assert(object->paging_in_progress);
4729         kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
4730         if (kr != KERN_SUCCESS) {
4731                 blobs = NULL;
4732         }
4733
4734         /* verify the SHA1 hash for this page */
4735         validated = cs_validate_page(blobs,
4736                                      offset + object->paging_offset,
4737                                      (const void *)kaddr,
4738                                      &tainted);
4739
4740         page->cs_validated = validated;
4741         if (validated) {
4742                 page->cs_tainted = tainted;
4743         }
4744 }
4745
4746 void
4747 vm_page_validate_cs(
4748         vm_page_t       page)
4749 {
4750         vm_object_t             object;
4751         vm_object_offset_t      offset;
4752         vm_map_offset_t         koffset;
4753         vm_map_size_t           ksize;
4754         vm_offset_t             kaddr;
4755         kern_return_t           kr;
4756         boolean_t               busy_page;
4757
4758         vm_object_lock_assert_held(page->object);
4759
4760         if (!cs_validation) {
4761                 return;
4762         }
4763
4764         if (page->wpmapped && !page->cs_tainted) {
4765                 vm_object_lock_assert_exclusive(page->object);
4766
4767                 /*
4768                  * This page was mapped for "write" access sometime in the
4769                  * past and could still be modifiable in the future.
4770                  * Consider it tainted.
4771                  * [ If the page was already found to be "tainted", no
4772                  * need to re-validate. ]
4773                  */
4774                 page->cs_validated = TRUE;
4775                 page->cs_tainted = TRUE;
4776                 if (cs_debug) {
4777                         printf("CODESIGNING: vm_page_validate_cs: "
4778                                "page %p obj %p off 0x%llx "
4779                                "was modified\n",
4780                                page, page->object, page->offset);
4781                 }
4782                 vm_cs_validated_dirtied++;
4783         }
4784
4785         if (page->cs_validated) {
4786                 return;
4787         }
4788
4789         vm_object_lock_assert_exclusive(page->object);
4790
4791         object = page->object;
4792         assert(object->code_signed);
4793         offset = page->offset;
4794
4795         busy_page = page->busy;
4796         if (!busy_page) {
4797                 /* keep page busy while we map (and unlock) the VM object */
4798                 page->busy = TRUE;
4799         }
4800
4801         /*
4802          * Take a paging reference on the VM object
4803          * to protect it from collapse or bypass,
4804          * and keep it from disappearing too.
4805          */
4806         vm_object_paging_begin(object);
4807
4808         /* map the page in the kernel address space */
4809         koffset = 0;
4810         ksize = PAGE_SIZE_64;
4811         kr = vm_paging_map_object(&koffset,
4812                                   page,
4813                                   object,
4814                                   offset,
4815                                   &ksize,
4816                                   VM_PROT_READ,
4817                                   FALSE); /* can't unlock object ! */
4818         if (kr != KERN_SUCCESS) {
4819                 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
4820         }
4821         kaddr = CAST_DOWN(vm_offset_t, koffset);
4822
4823         /* validate the mapped page */
4824         vm_page_validate_cs_mapped(page, (const void *) kaddr);
4825
4826         assert(page->busy);
4827         assert(object == page->object);
4828         vm_object_lock_assert_exclusive(object);
4829
4830         if (!busy_page) {
4831                 PAGE_WAKEUP_DONE(page);
4832         }
4833         if (koffset != 0) {
4834                 /* unmap the map from the kernel address space */
4835                 vm_paging_unmap_object(object, koffset, koffset + ksize);
4836                 koffset = 0;
4837                 ksize = 0;
4838                 kaddr = 0;
4839         }
4840         vm_object_paging_end(object);
4841 }