1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm_fault.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Page fault handling module.
63 */
64
65 #include <mach_cluster_stats.h>
66 #include <mach_pagemap.h>
67 #include <mach_kdb.h>
68 #include <libkern/OSAtomic.h>
69
70 #include <mach/mach_types.h>
71 #include <mach/kern_return.h>
72 #include <mach/message.h> /* for error codes */
73 #include <mach/vm_param.h>
74 #include <mach/vm_behavior.h>
75 #include <mach/memory_object.h>
76 /* For memory_object_data_{request,unlock} */
77 #include <mach/sdt.h>
78
79 #include <kern/kern_types.h>
80 #include <kern/host_statistics.h>
81 #include <kern/counters.h>
82 #include <kern/task.h>
83 #include <kern/thread.h>
84 #include <kern/sched_prim.h>
85 #include <kern/host.h>
86 #include <kern/xpr.h>
87 #include <kern/mach_param.h>
88 #include <kern/macro_help.h>
89 #include <kern/zalloc.h>
90 #include <kern/misc_protos.h>
91
92 #include <ppc/proc_reg.h>
93
94 #include <vm/vm_fault.h>
95 #include <vm/vm_map.h>
96 #include <vm/vm_object.h>
97 #include <vm/vm_page.h>
98 #include <vm/vm_kern.h>
99 #include <vm/pmap.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/vm_protos.h>
102 #include <vm/vm_external.h>
103 #include <vm/memory_object.h>
104 #include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */
105
106 #include <sys/kdebug.h>
107
108 #define VM_FAULT_CLASSIFY 0
109
110 /* Zero-filled pages are marked "m->zero_fill" and put on the
111 * special zero-fill inactive queue only if they belong to
112 * an object at least this big.
113 */
114 #define VM_ZF_OBJECT_SIZE_THRESHOLD (0x200000)
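/* Note (illustrative, not part of the original source): 0x200000 bytes == 2 MB,
 * i.e. only zero-fill pages belonging to objects larger than 2 MB get tagged
 * "m->zero_fill" (see vm_fault_zero_page below). */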
115
116 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
117
118 int vm_object_pagein_throttle = 16;
119
120 extern int cs_debug;
121
122 #if MACH_KDB
123 extern struct db_watchpoint *db_watchpoint_list;
124 #endif /* MACH_KDB */
125
126
127 /* Forward declarations of internal routines. */
128 extern kern_return_t vm_fault_wire_fast(
129 vm_map_t map,
130 vm_map_offset_t va,
131 vm_map_entry_t entry,
132 pmap_t pmap,
133 vm_map_offset_t pmap_addr);
134
135 extern void vm_fault_continue(void);
136
137 extern void vm_fault_copy_cleanup(
138 vm_page_t page,
139 vm_page_t top_page);
140
141 extern void vm_fault_copy_dst_cleanup(
142 vm_page_t page);
143
144 #if VM_FAULT_CLASSIFY
145 extern void vm_fault_classify(vm_object_t object,
146 vm_object_offset_t offset,
147 vm_prot_t fault_type);
148
149 extern void vm_fault_classify_init(void);
150 #endif
151
152
153 unsigned long vm_cs_validates = 0;
154 unsigned long vm_cs_revalidates = 0;
155 unsigned long vm_cs_query_modified = 0;
156 unsigned long vm_cs_validated_dirtied = 0;
157
158 /*
159 * Routine: vm_fault_init
160 * Purpose:
161 * Initialize our private data structures.
162 */
163 void
164 vm_fault_init(void)
165 {
166 }
167
168 /*
169 * Routine: vm_fault_cleanup
170 * Purpose:
171 * Clean up the result of vm_fault_page.
172 * Results:
173 * The paging reference for "object" is released.
174 * "object" is unlocked.
175 * If "top_page" is not null, "top_page" is
176 * freed and the paging reference for the object
177 * containing it is released.
178 *
179 * In/out conditions:
180 * "object" must be locked.
181 */
182 void
183 vm_fault_cleanup(
184 register vm_object_t object,
185 register vm_page_t top_page)
186 {
187 vm_object_paging_end(object);
188 vm_object_unlock(object);
189
190 if (top_page != VM_PAGE_NULL) {
191 object = top_page->object;
192
193 vm_object_lock(object);
194 VM_PAGE_FREE(top_page);
195 vm_object_paging_end(object);
196 vm_object_unlock(object);
197 }
198 }
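/*
 * Illustrative sketch (not part of the original source): a minimal
 * caller-side use of vm_fault_cleanup(), assuming "object", "result_page"
 * and "top_page" came back from a successful vm_fault_page() call, so that
 * "object" is locked and holds a paging reference:
 *
 *	PAGE_WAKEUP_DONE(result_page);		// caller is done with the busy page
 *	vm_fault_cleanup(object, top_page);	// drop the paging ref, unlock "object",
 *						// and free "top_page" if one was returned
 */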
199
200 #if MACH_CLUSTER_STATS
201 #define MAXCLUSTERPAGES 16
202 struct {
203 unsigned long pages_in_cluster;
204 unsigned long pages_at_higher_offsets;
205 unsigned long pages_at_lower_offsets;
206 } cluster_stats_in[MAXCLUSTERPAGES];
207 #define CLUSTER_STAT(clause) clause
208 #define CLUSTER_STAT_HIGHER(x) \
209 ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
210 #define CLUSTER_STAT_LOWER(x) \
211 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
212 #define CLUSTER_STAT_CLUSTER(x) \
213 ((cluster_stats_in[(x)].pages_in_cluster)++)
214 #else /* MACH_CLUSTER_STATS */
215 #define CLUSTER_STAT(clause)
216 #endif /* MACH_CLUSTER_STATS */
217
218 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
219
220
221 boolean_t vm_page_deactivate_behind = TRUE;
222 /*
223 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
224 */
225 int vm_default_ahead = 0;
226 int vm_default_behind = MAX_UPL_TRANSFER;
227
228 #define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024)
229
230 /*
231  * vm_fault_is_sequential
232 *
233 * Determine if sequential access is in progress
234 * in accordance with the behavior specified.
235 * Update state to indicate current access pattern.
236 *
237 * object must have at least the shared lock held
238 */
239 static
240 void
241 vm_fault_is_sequential(
242 vm_object_t object,
243 vm_object_offset_t offset,
244 vm_behavior_t behavior)
245 {
246 vm_object_offset_t last_alloc;
247 int sequential;
248 int orig_sequential;
249
250 last_alloc = object->last_alloc;
251 sequential = object->sequential;
252 orig_sequential = sequential;
253
254 switch (behavior) {
255 case VM_BEHAVIOR_RANDOM:
256 /*
257 * reset indicator of sequential behavior
258 */
259 sequential = 0;
260 break;
261
262 case VM_BEHAVIOR_SEQUENTIAL:
263 if (offset && last_alloc == offset - PAGE_SIZE_64) {
264 /*
265 * advance indicator of sequential behavior
266 */
267 if (sequential < MAX_SEQUENTIAL_RUN)
268 sequential += PAGE_SIZE;
269 } else {
270 /*
271 * reset indicator of sequential behavior
272 */
273 sequential = 0;
274 }
275 break;
276
277 case VM_BEHAVIOR_RSEQNTL:
278 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
279 /*
280 * advance indicator of sequential behavior
281 */
282 if (sequential > -MAX_SEQUENTIAL_RUN)
283 sequential -= PAGE_SIZE;
284 } else {
285 /*
286 * reset indicator of sequential behavior
287 */
288 sequential = 0;
289 }
290 break;
291
292 case VM_BEHAVIOR_DEFAULT:
293 default:
294 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
295 /*
296 * advance indicator of sequential behavior
297 */
298 if (sequential < 0)
299 sequential = 0;
300 if (sequential < MAX_SEQUENTIAL_RUN)
301 sequential += PAGE_SIZE;
302
303 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
304 /*
305 * advance indicator of sequential behavior
306 */
307 if (sequential > 0)
308 sequential = 0;
309 if (sequential > -MAX_SEQUENTIAL_RUN)
310 sequential -= PAGE_SIZE;
311 } else {
312 /*
313 * reset indicator of sequential behavior
314 */
315 sequential = 0;
316 }
317 break;
318 }
319 if (sequential != orig_sequential) {
320 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
321 /*
322 * if someone else has already updated object->sequential
323 * don't bother trying to update it or object->last_alloc
324 */
325 return;
326 }
327 }
328 /*
329 * I'd like to do this with a OSCompareAndSwap64, but that
330 * doesn't exist for PPC... however, it shouldn't matter
331 * that much... last_alloc is maintained so that we can determine
332 * if a sequential access pattern is taking place... if only
333 * one thread is banging on this object, no problem with the unprotected
334 * update... if 2 or more threads are banging away, we run the risk of
335 * someone seeing a mangled update... however, in the face of multiple
336 * accesses, no sequential access pattern can develop anyway, so we
337 * haven't lost any real info.
338 */
339 object->last_alloc = offset;
340 }
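/*
 * Worked example (illustrative only): starting from a fresh object
 * (sequential == 0, last_alloc == 0) with VM_BEHAVIOR_DEFAULT, faults at
 * offsets 0, PAGE_SIZE and 2*PAGE_SIZE leave object->sequential at
 * 2*PAGE_SIZE and object->last_alloc at 2*PAGE_SIZE, because each fault
 * after the first sees last_alloc == offset - PAGE_SIZE_64 and grows the
 * run (capped at MAX_SEQUENTIAL_RUN).  A later fault whose offset matches
 * neither neighboring page resets object->sequential to 0.
 */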
341
342
343 /*
344  * vm_fault_deactivate_behind
345 *
346 * Determine if sequential access is in progress
347 * in accordance with the behavior specified. If
348 * so, compute a potential page to deactivate and
349 * deactivate it.
350 *
351 * object must be locked.
352 *
353 * return TRUE if we actually deactivate a page
354 */
355 static
356 boolean_t
357 vm_fault_deactivate_behind(
358 vm_object_t object,
359 vm_object_offset_t offset,
360 vm_behavior_t behavior)
361 {
362 vm_page_t m = NULL;
363 int sequential_run;
364 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
365
366 #if TRACEFAULTPAGE
367 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
368 #endif
369
370 if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
371 /*
372 * Do not deactivate pages from the kernel object: they
373 * are not intended to become pageable.
374 * or we've disabled the deactivate behind mechanism
375 */
376 return FALSE;
377 }
378 if ((sequential_run = object->sequential)) {
379 if (sequential_run < 0) {
380 sequential_behavior = VM_BEHAVIOR_RSEQNTL;
381 sequential_run = 0 - sequential_run;
382 } else {
383 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
384 }
385 }
386 switch (behavior) {
387 case VM_BEHAVIOR_RANDOM:
388 break;
389 case VM_BEHAVIOR_SEQUENTIAL:
390 if (sequential_run >= (int)PAGE_SIZE)
391 m = vm_page_lookup(object, offset - PAGE_SIZE_64);
392 break;
393 case VM_BEHAVIOR_RSEQNTL:
394 if (sequential_run >= (int)PAGE_SIZE)
395 m = vm_page_lookup(object, offset + PAGE_SIZE_64);
396 break;
397 case VM_BEHAVIOR_DEFAULT:
398 default:
399 { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
400
401 /*
402                  * determine if the run of sequential access has been
403 * long enough on an object with default access behavior
404 * to consider it for deactivation
405 */
406 if ((uint64_t)sequential_run >= behind) {
407 if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
408 if (offset >= behind)
409 m = vm_page_lookup(object, offset - behind);
410 } else {
411 if (offset < -behind)
412 m = vm_page_lookup(object, offset + behind);
413 }
414 }
415 break;
416 }
417 }
418 if (m) {
419 if (!m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
420 pmap_clear_reference(m->phys_page);
421 m->deactivated = TRUE;
422 #if TRACEFAULTPAGE
423 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
424 #endif
425 return TRUE;
426 }
427 }
428 return FALSE;
429 }
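/*
 * Worked example (illustrative only): under VM_BEHAVIOR_DEFAULT, once a
 * forward sequential run reaches vm_default_behind pages, a fault at
 * "offset" looks up the page at (offset - vm_default_behind * PAGE_SIZE_64);
 * if that page is resident and not busy, absent, fictitious, throttled or
 * no_cache, its hardware reference bit is cleared and it is flagged
 * "deactivated", making the trailing edge of the run an early candidate
 * for reclamation.
 */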
430
431
432 /*
433 * check for various conditions that would
434 * prevent us from creating a ZF page...
435 * cleanup is based on being called from vm_fault_page
436 *
437 * object must be locked
438 * object == m->object
439 */
440 static vm_fault_return_t
441 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
442 {
443 if (object->shadow_severed) {
444 /*
445 * the shadow chain was severed
446 * just have to return an error at this point
447 */
448 if (m != VM_PAGE_NULL)
449 VM_PAGE_FREE(m);
450 vm_fault_cleanup(object, first_m);
451
452 thread_interrupt_level(interruptible_state);
453
454 return (VM_FAULT_MEMORY_ERROR);
455 }
456 if (vm_backing_store_low) {
457 /*
458                  * Are we protecting the system from
459                  * backing store exhaustion?  If so,
460                  * sleep unless we are privileged.
461 */
462 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
463
464 if (m != VM_PAGE_NULL)
465 VM_PAGE_FREE(m);
466 vm_fault_cleanup(object, first_m);
467
468 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
469
470 thread_block(THREAD_CONTINUE_NULL);
471 thread_interrupt_level(interruptible_state);
472
473 return (VM_FAULT_RETRY);
474 }
475 }
476 if (VM_PAGE_ZFILL_THROTTLED()) {
477 /*
478 * we're throttling zero-fills...
479 * treat this as if we couldn't grab a page
480 */
481 if (m != VM_PAGE_NULL)
482 VM_PAGE_FREE(m);
483 vm_fault_cleanup(object, first_m);
484
485 thread_interrupt_level(interruptible_state);
486
487 return (VM_FAULT_MEMORY_SHORTAGE);
488 }
489 return (VM_FAULT_SUCCESS);
490 }
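/*
 * Illustrative sketch (not part of the original source): how the check
 * above is used from vm_fault_page() before creating a zero-fill page.
 * On any result other than VM_FAULT_SUCCESS the cleanup (including the
 * thread_interrupt_level reset) has already been done, so the caller
 * simply propagates the error:
 *
 *	error = vm_fault_check(object, m, first_m, interruptible_state);
 *	if (error != VM_FAULT_SUCCESS)
 *		return (error);
 *	my_fault = vm_fault_zero_page(m, no_zero_fill);
 */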
491
492
493 /*
494 * do the work to zero fill a page and
495 * inject it into the correct paging queue
496 *
497 * m->object must be locked
498 * page queue lock must NOT be held
499 */
500 static int
501 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
502 {
503 int my_fault = DBG_ZERO_FILL_FAULT;
504
505 /*
506          * This is a zero-fill page fault...
507 *
508 * Checking the page lock is a waste of
509 * time; this page was absent, so
510 * it can't be page locked by a pager.
511 *
512 * we also consider it undefined
513 * with respect to instruction
514 * execution. i.e. it is the responsibility
515 * of higher layers to call for an instruction
516 * sync after changing the contents and before
517 * sending a program into this area. We
518 * choose this approach for performance
519 */
520 m->pmapped = TRUE;
521
522 m->cs_validated = FALSE;
523 m->cs_tainted = FALSE;
524
525 if (no_zero_fill == TRUE)
526 my_fault = DBG_NZF_PAGE_FAULT;
527 else {
528 vm_page_zero_fill(m);
529
530 VM_STAT_INCR(zero_fill_count);
531 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
532 }
533 assert(!m->laundry);
534 assert(m->object != kernel_object);
535 //assert(m->pageq.next == NULL && m->pageq.prev == NULL);
536
537 if (!IP_VALID(memory_manager_default) &&
538 (m->object->purgable == VM_PURGABLE_DENY ||
539 m->object->purgable == VM_PURGABLE_NONVOLATILE ||
540 m->object->purgable == VM_PURGABLE_VOLATILE )) {
541 vm_page_lock_queues();
542
543 queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
544 m->throttled = TRUE;
545 vm_page_throttled_count++;
546
547 vm_page_unlock_queues();
548 } else {
549 if (m->object->size > VM_ZF_OBJECT_SIZE_THRESHOLD) {
550 m->zero_fill = TRUE;
551 OSAddAtomic(1, (SInt32 *)&vm_zf_count);
552 }
553 }
554 return (my_fault);
555 }
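/*
 * Illustrative sketch (not part of the original source): the caller holds
 * m->object's lock, does not hold the page queue lock, and uses the return
 * value as the fault type reported in the trace point:
 *
 *	my_fault = vm_fault_zero_page(m, no_zero_fill);
 *	// DBG_ZERO_FILL_FAULT, or DBG_NZF_PAGE_FAULT when no_zero_fill
 *	// was TRUE and the page contents were left untouched
 */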
556
557
558 /*
559 * Routine: vm_fault_page
560 * Purpose:
561 * Find the resident page for the virtual memory
562 * specified by the given virtual memory object
563 * and offset.
564 * Additional arguments:
565  *              The required permissions for the page are given
566 * in "fault_type". Desired permissions are included
567 * in "protection".
568 * fault_info is passed along to determine pagein cluster
569 * limits... it contains the expected reference pattern,
570 * cluster size if available, etc...
571 *
572 * If the desired page is known to be resident (for
573 * example, because it was previously wired down), asserting
574  *              the "must_be_resident" parameter will speed the search.
575 *
576 * If the operation can be interrupted (by thread_abort
577 * or thread_terminate), then the "interruptible"
578 * parameter should be asserted.
579 *
580 * Results:
581 * The page containing the proper data is returned
582 * in "result_page".
583 *
584 * In/out conditions:
585 * The source object must be locked and referenced,
586 * and must donate one paging reference. The reference
587 * is not affected. The paging reference and lock are
588 * consumed.
589 *
590 * If the call succeeds, the object in which "result_page"
591 * resides is left locked and holding a paging reference.
592 * If this is not the original object, a busy page in the
593 * original object is returned in "top_page", to prevent other
594 * callers from pursuing this same data, along with a paging
595 * reference for the original object. The "top_page" should
596 * be destroyed when this guarantee is no longer required.
597 * The "result_page" is also left busy. It is not removed
598 * from the pageout queues.
599 */
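/*
 * Illustrative caller sketch (not part of the original source), restating
 * the contract above: the caller donates a locked, referenced object with
 * one paging reference; on VM_FAULT_SUCCESS it gets back a busy
 * "result_page" in a locked object, plus an optional "top_page" to dispose
 * of via vm_fault_cleanup():
 *
 *	vm_object_lock(object);
 *	vm_object_paging_begin(object);
 *	result = vm_fault_page(object, offset, fault_type, FALSE,
 *			       &prot, &result_page, &top_page,
 *			       &type_of_fault, &error_code,
 *			       no_zero_fill, FALSE, &fault_info);
 *	if (result == VM_FAULT_SUCCESS) {
 *		... consume result_page ...
 *		PAGE_WAKEUP_DONE(result_page);
 *		vm_fault_cleanup(result_page->object, top_page);
 *		vm_object_deallocate(object);
 *	}
 */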
600
601 vm_fault_return_t
602 vm_fault_page(
603 /* Arguments: */
604 vm_object_t first_object, /* Object to begin search */
605 vm_object_offset_t first_offset, /* Offset into object */
606 vm_prot_t fault_type, /* What access is requested */
607 boolean_t must_be_resident,/* Must page be resident? */
608 /* Modifies in place: */
609 vm_prot_t *protection, /* Protection for mapping */
610 /* Returns: */
611 vm_page_t *result_page, /* Page found, if successful */
612 vm_page_t *top_page, /* Page in top object, if
613 * not result_page. */
614 int *type_of_fault, /* if non-null, fill in with type of fault
615 * COW, zero-fill, etc... returned in trace point */
616 /* More arguments: */
617 kern_return_t *error_code, /* code if page is in error */
618 boolean_t no_zero_fill, /* don't zero fill absent pages */
619 #if MACH_PAGEMAP
620 boolean_t data_supply, /* treat as data_supply if
621 * it is a write fault and a full
622 * page is provided */
623 #else
624 __unused boolean_t data_supply,
625 #endif
626 vm_object_fault_info_t fault_info)
627 {
628 vm_page_t m;
629 vm_object_t object;
630 vm_object_offset_t offset;
631 vm_page_t first_m;
632 vm_object_t next_object;
633 vm_object_t copy_object;
634 boolean_t look_for_page;
635 vm_prot_t access_required = fault_type;
636 vm_prot_t wants_copy_flag;
637 CLUSTER_STAT(int pages_at_higher_offsets;)
638 CLUSTER_STAT(int pages_at_lower_offsets;)
639 kern_return_t wait_result;
640 boolean_t interruptible_state;
641 vm_fault_return_t error;
642 int my_fault;
643 uint32_t try_failed_count;
644         int                     interruptible; /* how may the fault be interrupted? */
645 memory_object_t pager;
646
647 /*
648 * MACH page map - an optional optimization where a bit map is maintained
649 * by the VM subsystem for internal objects to indicate which pages of
650 * the object currently reside on backing store. This existence map
651 * duplicates information maintained by the vnode pager. It is
652 * created at the time of the first pageout against the object, i.e.
653 * at the same time pager for the object is created. The optimization
654 * is designed to eliminate pager interaction overhead, if it is
655 * 'known' that the page does not exist on backing store.
656 *
657 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
658 * either marked as paged out in the existence map for the object or no
659 * existence map exists for the object. MUST_ASK_PAGER() is one of the
660 * criteria in the decision to invoke the pager. It is also used as one
661 * of the criteria to terminate the scan for adjacent pages in a clustered
662 * pagein operation. Note that MUST_ASK_PAGER() always evaluates to TRUE for
663 * permanent objects. Note also that if the pager for an internal object
664 * has not been created, the pager is not invoked regardless of the value
665 * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
666 * for which a pager has been created.
667 *
668 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
669  * is marked as paged out in the existence map for the object.
670  * PAGED_OUT() is used to determine if a page has already been pushed
671 * into a copy object in order to avoid a redundant page out operation.
672 */
673 #if MACH_PAGEMAP
674 #define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \
675 != VM_EXTERNAL_STATE_ABSENT)
676 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
677 == VM_EXTERNAL_STATE_EXISTS)
678 #else
679 #define MUST_ASK_PAGER(o, f) (TRUE)
680 #define PAGED_OUT(o, f) (FALSE)
681 #endif
682
683 /*
684 * Recovery actions
685 */
686 #define PREPARE_RELEASE_PAGE(m) \
687 MACRO_BEGIN \
688 vm_page_lock_queues(); \
689 MACRO_END
690
691 #define DO_RELEASE_PAGE(m) \
692 MACRO_BEGIN \
693 PAGE_WAKEUP_DONE(m); \
694 if (!m->active && !m->inactive && !m->throttled)\
695 vm_page_activate(m); \
696 vm_page_unlock_queues(); \
697 MACRO_END
698
699 #define RELEASE_PAGE(m) \
700 MACRO_BEGIN \
701 PREPARE_RELEASE_PAGE(m); \
702 DO_RELEASE_PAGE(m); \
703 MACRO_END
704
705 #if TRACEFAULTPAGE
706 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
707 #endif
708
709
710 #if MACH_KDB
711 /*
712 * If there are watchpoints set, then
713 * we don't want to give away write permission
714 * on a read fault. Make the task write fault,
715 * so that the watchpoint code notices the access.
716 */
717 if (db_watchpoint_list) {
718 /*
719 * If we aren't asking for write permission,
720 * then don't give it away. We're using write
721 * faults to set the dirty bit.
722 */
723 if (!(fault_type & VM_PROT_WRITE))
724 *protection &= ~VM_PROT_WRITE;
725 }
726 #endif /* MACH_KDB */
727
728 interruptible = fault_info->interruptible;
729 interruptible_state = thread_interrupt_level(interruptible);
730
731 /*
732 * INVARIANTS (through entire routine):
733 *
734 * 1) At all times, we must either have the object
735 * lock or a busy page in some object to prevent
736 * some other thread from trying to bring in
737 * the same page.
738 *
739 * Note that we cannot hold any locks during the
740 * pager access or when waiting for memory, so
741 * we use a busy page then.
742 *
743 * 2) To prevent another thread from racing us down the
744 * shadow chain and entering a new page in the top
745 * object before we do, we must keep a busy page in
746 * the top object while following the shadow chain.
747 *
748 * 3) We must increment paging_in_progress on any object
749 * for which we have a busy page before dropping
750 * the object lock
751 *
752 * 4) We leave busy pages on the pageout queues.
753 * If the pageout daemon comes across a busy page,
754 * it will remove the page from the pageout queues.
755 */
756
757 object = first_object;
758 offset = first_offset;
759 first_m = VM_PAGE_NULL;
760 access_required = fault_type;
761
762
763 XPR(XPR_VM_FAULT,
764 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
765 (integer_t)object, offset, fault_type, *protection, 0);
766
767 /*
768 * default type of fault
769 */
770 my_fault = DBG_CACHE_HIT_FAULT;
771
772 while (TRUE) {
773 #if TRACEFAULTPAGE
774 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
775 #endif
776 if (!object->alive) {
777 /*
778 * object is no longer valid
779 * clean up and return error
780 */
781 vm_fault_cleanup(object, first_m);
782 thread_interrupt_level(interruptible_state);
783
784 return (VM_FAULT_MEMORY_ERROR);
785 }
786
787 /*
788 * See whether the page at 'offset' is resident
789 */
790 m = vm_page_lookup(object, offset);
791 #if TRACEFAULTPAGE
792 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
793 #endif
794 if (m != VM_PAGE_NULL) {
795
796 if (m->busy) {
797 /*
798 * The page is being brought in,
799 * wait for it and then retry.
800 *
801 * A possible optimization: if the page
802 * is known to be resident, we can ignore
803 * pages that are absent (regardless of
804 * whether they're busy).
805 */
806 #if TRACEFAULTPAGE
807 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
808 #endif
809 wait_result = PAGE_SLEEP(object, m, interruptible);
810 XPR(XPR_VM_FAULT,
811 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
812 (integer_t)object, offset,
813 (integer_t)m, 0, 0);
814 counter(c_vm_fault_page_block_busy_kernel++);
815
816 if (wait_result != THREAD_AWAKENED) {
817 vm_fault_cleanup(object, first_m);
818 thread_interrupt_level(interruptible_state);
819
820 if (wait_result == THREAD_RESTART)
821 return (VM_FAULT_RETRY);
822 else
823 return (VM_FAULT_INTERRUPTED);
824 }
825 continue;
826 }
827
828 if (m->phys_page == vm_page_guard_addr) {
829 /*
830 * Guard page: off limits !
831 */
832 if (fault_type == VM_PROT_NONE) {
833 /*
834 * The fault is not requesting any
835 * access to the guard page, so it must
836 * be just to wire or unwire it.
837 * Let's pretend it succeeded...
838 */
839 m->busy = TRUE;
840 *result_page = m;
841 assert(first_m == VM_PAGE_NULL);
842 *top_page = first_m;
843 if (type_of_fault)
844 *type_of_fault = DBG_GUARD_FAULT;
845 return VM_FAULT_SUCCESS;
846 } else {
847 /*
848 * The fault requests access to the
849 * guard page: let's deny that !
850 */
851 vm_fault_cleanup(object, first_m);
852 thread_interrupt_level(interruptible_state);
853 return VM_FAULT_MEMORY_ERROR;
854 }
855 }
856
857 if (m->error) {
858 /*
859 * The page is in error, give up now.
860 */
861 #if TRACEFAULTPAGE
862 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
863 #endif
864 if (error_code)
865 *error_code = KERN_MEMORY_ERROR;
866 VM_PAGE_FREE(m);
867
868 vm_fault_cleanup(object, first_m);
869 thread_interrupt_level(interruptible_state);
870
871 return (VM_FAULT_MEMORY_ERROR);
872 }
873 if (m->restart) {
874 /*
875 * The pager wants us to restart
876 * at the top of the chain,
877 * typically because it has moved the
878 * page to another pager, then do so.
879 */
880 #if TRACEFAULTPAGE
881 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
882 #endif
883 VM_PAGE_FREE(m);
884
885 vm_fault_cleanup(object, first_m);
886 thread_interrupt_level(interruptible_state);
887
888 return (VM_FAULT_RETRY);
889 }
890 if (m->absent) {
891 /*
892 * The page isn't busy, but is absent,
893 * therefore it's deemed "unavailable".
894 *
895 * Remove the non-existent page (unless it's
896 * in the top object) and move on down to the
897 * next object (if there is one).
898 */
899 #if TRACEFAULTPAGE
900 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
901 #endif
902 next_object = object->shadow;
903
904 if (next_object == VM_OBJECT_NULL) {
905 /*
906 * Absent page at bottom of shadow
907 * chain; zero fill the page we left
908 * busy in the first object, and free
909 * the absent page.
910 */
911 assert(!must_be_resident);
912
913 /*
914 * check for any conditions that prevent
915 * us from creating a new zero-fill page
916 * vm_fault_check will do all of the
917 * fault cleanup in the case of an error condition
918 * including resetting the thread_interrupt_level
919 */
920 error = vm_fault_check(object, m, first_m, interruptible_state);
921
922 if (error != VM_FAULT_SUCCESS)
923 return (error);
924
925 XPR(XPR_VM_FAULT,
926 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
927 (integer_t)object, offset,
928 (integer_t)m,
929 (integer_t)first_object, 0);
930
931 if (object != first_object) {
932 /*
933 * free the absent page we just found
934 */
935 VM_PAGE_FREE(m);
936
937 /*
938 * drop reference and lock on current object
939 */
940 vm_object_paging_end(object);
941 vm_object_unlock(object);
942
943 /*
944 * grab the original page we
945 * 'soldered' in place and
946 * retake lock on 'first_object'
947 */
948 m = first_m;
949 first_m = VM_PAGE_NULL;
950
951 object = first_object;
952 offset = first_offset;
953
954 vm_object_lock(object);
955 } else {
956 /*
957 * we're going to use the absent page we just found
958 * so convert it to a 'busy' page
959 */
960 m->absent = FALSE;
961 m->busy = TRUE;
962 }
963 /*
964 * zero-fill the page and put it on
965 * the correct paging queue
966 */
967 my_fault = vm_fault_zero_page(m, no_zero_fill);
968
969 break;
970 } else {
971 if (must_be_resident)
972 vm_object_paging_end(object);
973 else if (object != first_object) {
974 vm_object_paging_end(object);
975 VM_PAGE_FREE(m);
976 } else {
977 first_m = m;
978 m->absent = FALSE;
979 m->busy = TRUE;
980
981 vm_page_lockspin_queues();
982 VM_PAGE_QUEUES_REMOVE(m);
983 vm_page_unlock_queues();
984 }
985 XPR(XPR_VM_FAULT,
986 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
987 (integer_t)object, offset,
988 (integer_t)next_object,
989 offset+object->shadow_offset,0);
990
991 offset += object->shadow_offset;
992 fault_info->lo_offset += object->shadow_offset;
993 fault_info->hi_offset += object->shadow_offset;
994 access_required = VM_PROT_READ;
995
996 vm_object_lock(next_object);
997 vm_object_unlock(object);
998 object = next_object;
999 vm_object_paging_begin(object);
1000
1001 /*
1002 * reset to default type of fault
1003 */
1004 my_fault = DBG_CACHE_HIT_FAULT;
1005
1006 continue;
1007 }
1008 }
1009 if ((m->cleaning)
1010 && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1011 && (fault_type & VM_PROT_WRITE)) {
1012 /*
1013 * This is a copy-on-write fault that will
1014 * cause us to revoke access to this page, but
1015 * this page is in the process of being cleaned
1016 * in a clustered pageout. We must wait until
1017 * the cleaning operation completes before
1018 * revoking access to the original page,
1019 * otherwise we might attempt to remove a
1020 * wired mapping.
1021 */
1022 #if TRACEFAULTPAGE
1023 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
1024 #endif
1025 XPR(XPR_VM_FAULT,
1026 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1027 (integer_t)object, offset,
1028 (integer_t)m, 0, 0);
1029 /*
1030 * take an extra ref so that object won't die
1031 */
1032 vm_object_reference_locked(object);
1033
1034 vm_fault_cleanup(object, first_m);
1035
1036 counter(c_vm_fault_page_block_backoff_kernel++);
1037 vm_object_lock(object);
1038 assert(object->ref_count > 0);
1039
1040 m = vm_page_lookup(object, offset);
1041
1042 if (m != VM_PAGE_NULL && m->cleaning) {
1043 PAGE_ASSERT_WAIT(m, interruptible);
1044
1045 vm_object_unlock(object);
1046 wait_result = thread_block(THREAD_CONTINUE_NULL);
1047 vm_object_deallocate(object);
1048
1049 goto backoff;
1050 } else {
1051 vm_object_unlock(object);
1052
1053 vm_object_deallocate(object);
1054 thread_interrupt_level(interruptible_state);
1055
1056 return (VM_FAULT_RETRY);
1057 }
1058 }
1059 if (type_of_fault == NULL && m->speculative) {
1060 /*
1061 * If we were passed a non-NULL pointer for
1062                          * "type_of_fault", then we came from
1063 * vm_fault... we'll let it deal with
1064 * this condition, since it
1065 * needs to see m->speculative to correctly
1066 * account the pageins, otherwise...
1067 * take it off the speculative queue, we'll
1068 * let the caller of vm_fault_page deal
1069 * with getting it onto the correct queue
1070 */
1071 vm_page_lockspin_queues();
1072 VM_PAGE_QUEUES_REMOVE(m);
1073 vm_page_unlock_queues();
1074 }
1075
1076 if (m->encrypted) {
1077 /*
1078 * ENCRYPTED SWAP:
1079 * the user needs access to a page that we
1080 * encrypted before paging it out.
1081 * Decrypt the page now.
1082 * Keep it busy to prevent anyone from
1083 * accessing it during the decryption.
1084 */
1085 m->busy = TRUE;
1086 vm_page_decrypt(m, 0);
1087 assert(object == m->object);
1088 assert(m->busy);
1089 PAGE_WAKEUP_DONE(m);
1090
1091 /*
1092 * Retry from the top, in case
1093 * something changed while we were
1094 * decrypting.
1095 */
1096 continue;
1097 }
1098 ASSERT_PAGE_DECRYPTED(m);
1099
1100 if (m->object->code_signed) {
1101 /*
1102 * CODE SIGNING:
1103 * We just paged in a page from a signed
1104 * memory object but we don't need to
1105                          * validate it now.  We'll validate it
1106 * when it gets mapped into a user address
1107 * space for the first time or when the page
1108 * gets copied to another object as a result
1109 * of a copy-on-write.
1110 */
1111 }
1112
1113 /*
1114 * We mark the page busy and leave it on
1115 * the pageout queues. If the pageout
1116                  * daemon comes across it, then it will
1117 * remove the page from the queue, but not the object
1118 */
1119 #if TRACEFAULTPAGE
1120 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1121 #endif
1122 XPR(XPR_VM_FAULT,
1123 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1124 (integer_t)object, offset, (integer_t)m, 0, 0);
1125 assert(!m->busy);
1126 assert(!m->absent);
1127
1128 m->busy = TRUE;
1129 break;
1130 }
1131
1132
1133 /*
1134 * we get here when there is no page present in the object at
1135 * the offset we're interested in... we'll allocate a page
1136 * at this point if the pager associated with
1137 * this object can provide the data or we're the top object...
1138 * object is locked; m == NULL
1139 */
1140 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply);
1141
1142 #if TRACEFAULTPAGE
1143 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1144 #endif
1145 if ((look_for_page || (object == first_object)) && !must_be_resident && !object->phys_contiguous) {
1146 /*
1147 * Allocate a new page for this object/offset pair
1148 */
1149 m = vm_page_grab();
1150 #if TRACEFAULTPAGE
1151 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1152 #endif
1153 if (m == VM_PAGE_NULL) {
1154
1155 vm_fault_cleanup(object, first_m);
1156 thread_interrupt_level(interruptible_state);
1157
1158 return (VM_FAULT_MEMORY_SHORTAGE);
1159 }
1160 vm_page_insert(m, object, offset);
1161 }
1162 if (look_for_page && !must_be_resident) {
1163 kern_return_t rc;
1164
1165 /*
1166 * If the memory manager is not ready, we
1167 * cannot make requests.
1168 */
1169 if (!object->pager_ready) {
1170 #if TRACEFAULTPAGE
1171 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1172 #endif
1173 if (m != VM_PAGE_NULL)
1174 VM_PAGE_FREE(m);
1175
1176 XPR(XPR_VM_FAULT,
1177 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1178 (integer_t)object, offset, 0, 0, 0);
1179
1180 /*
1181 * take an extra ref so object won't die
1182 */
1183 vm_object_reference_locked(object);
1184 vm_fault_cleanup(object, first_m);
1185 counter(c_vm_fault_page_block_backoff_kernel++);
1186
1187 vm_object_lock(object);
1188 assert(object->ref_count > 0);
1189
1190 if (!object->pager_ready) {
1191 wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1192
1193 vm_object_unlock(object);
1194 if (wait_result == THREAD_WAITING)
1195 wait_result = thread_block(THREAD_CONTINUE_NULL);
1196 vm_object_deallocate(object);
1197
1198 goto backoff;
1199 } else {
1200 vm_object_unlock(object);
1201 vm_object_deallocate(object);
1202 thread_interrupt_level(interruptible_state);
1203
1204 return (VM_FAULT_RETRY);
1205 }
1206 }
1207 if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1208 /*
1209 * If there are too many outstanding page
1210 * requests pending on this external object, we
1211 * wait for them to be resolved now.
1212 */
1213 #if TRACEFAULTPAGE
1214 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1215 #endif
1216 if (m != VM_PAGE_NULL)
1217 VM_PAGE_FREE(m);
1218 /*
1219 * take an extra ref so object won't die
1220 */
1221 vm_object_reference_locked(object);
1222
1223 vm_fault_cleanup(object, first_m);
1224
1225 counter(c_vm_fault_page_block_backoff_kernel++);
1226
1227 vm_object_lock(object);
1228 assert(object->ref_count > 0);
1229
1230 if (object->paging_in_progress > vm_object_pagein_throttle) {
1231 vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_IN_PROGRESS, interruptible);
1232
1233 vm_object_unlock(object);
1234 wait_result = thread_block(THREAD_CONTINUE_NULL);
1235 vm_object_deallocate(object);
1236
1237 goto backoff;
1238 } else {
1239 vm_object_unlock(object);
1240 vm_object_deallocate(object);
1241 thread_interrupt_level(interruptible_state);
1242
1243 return (VM_FAULT_RETRY);
1244 }
1245 }
1246 if (m != VM_PAGE_NULL) {
1247 /*
1248 * Indicate that the page is waiting for data
1249 * from the memory manager.
1250 */
1251 m->list_req_pending = TRUE;
1252 m->absent = TRUE;
1253 }
1254
1255 #if TRACEFAULTPAGE
1256 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1257 #endif
1258
1259 /*
1260 * It's possible someone called vm_object_destroy while we weren't
1261 * holding the object lock. If that has happened, then bail out
1262 * here.
1263 */
1264
1265 pager = object->pager;
1266
1267 if (pager == MEMORY_OBJECT_NULL) {
1268 vm_fault_cleanup(object, first_m);
1269 thread_interrupt_level(interruptible_state);
1270 return VM_FAULT_MEMORY_ERROR;
1271 }
1272
1273 /*
1274 * We have an absent page in place for the faulting offset,
1275 * so we can release the object lock.
1276 */
1277
1278 vm_object_unlock(object);
1279
1280 /*
1281 * If this object uses a copy_call strategy,
1282 * and we are interested in a copy of this object
1283 * (having gotten here only by following a
1284 * shadow chain), then tell the memory manager
1285 * via a flag added to the desired_access
1286 * parameter, so that it can detect a race
1287 * between our walking down the shadow chain
1288 * and its pushing pages up into a copy of
1289 * the object that it manages.
1290 */
1291 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1292 wants_copy_flag = VM_PROT_WANTS_COPY;
1293 else
1294 wants_copy_flag = VM_PROT_NONE;
1295
1296 XPR(XPR_VM_FAULT,
1297 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1298 (integer_t)object, offset, (integer_t)m,
1299 access_required | wants_copy_flag, 0);
1300
1301 /*
1302 * Call the memory manager to retrieve the data.
1303 */
1304 rc = memory_object_data_request(
1305 pager,
1306 offset + object->paging_offset,
1307 PAGE_SIZE,
1308 access_required | wants_copy_flag,
1309 (memory_object_fault_info_t)fault_info);
1310
1311 #if TRACEFAULTPAGE
1312 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1313 #endif
1314 vm_object_lock(object);
1315
1316 if (rc != KERN_SUCCESS) {
1317
1318 vm_fault_cleanup(object, first_m);
1319 thread_interrupt_level(interruptible_state);
1320
1321 return ((rc == MACH_SEND_INTERRUPTED) ?
1322 VM_FAULT_INTERRUPTED :
1323 VM_FAULT_MEMORY_ERROR);
1324 }
1325 if ((interruptible != THREAD_UNINT) && (current_thread()->sched_mode & TH_MODE_ABORT)) {
1326
1327 vm_fault_cleanup(object, first_m);
1328 thread_interrupt_level(interruptible_state);
1329
1330 return (VM_FAULT_INTERRUPTED);
1331 }
1332 if (m == VM_PAGE_NULL && object->phys_contiguous) {
1333 /*
1334 * No page here means that the object we
1335 * initially looked up was "physically
1336 * contiguous" (i.e. device memory). However,
1337 * with Virtual VRAM, the object might not
1338 * be backed by that device memory anymore,
1339 * so we're done here only if the object is
1340 * still "phys_contiguous".
1341 * Otherwise, if the object is no longer
1342 * "phys_contiguous", we need to retry the
1343 * page fault against the object's new backing
1344 * store (different memory object).
1345 */
1346 break;
1347 }
1348 /*
1349 * potentially a pagein fault
1350 * if we make it through the state checks
1351                  * above, then we'll count it as such
1352 */
1353 my_fault = DBG_PAGEIN_FAULT;
1354
1355 /*
1356 * Retry with same object/offset, since new data may
1357 * be in a different page (i.e., m is meaningless at
1358 * this point).
1359 */
1360 continue;
1361 }
1362
1363 /*
1364 * We get here if the object has no pager, or an existence map
1365 * exists and indicates the page isn't present on the pager
1366 * or we're unwiring a page. If a pager exists, but there
1367 * is no existence map, then the m->absent case above handles
1368 * the ZF case when the pager can't provide the page
1369 */
1370 #if TRACEFAULTPAGE
1371 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1372 #endif
1373 if (object == first_object)
1374 first_m = m;
1375 else
1376 assert(m == VM_PAGE_NULL);
1377
1378 XPR(XPR_VM_FAULT,
1379 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1380 (integer_t)object, offset, (integer_t)m,
1381 (integer_t)object->shadow, 0);
1382
1383 next_object = object->shadow;
1384
1385 if (next_object == VM_OBJECT_NULL) {
1386 /*
1387                  * we've hit the bottom of the shadow chain,
1388 * fill the page in the top object with zeros.
1389 */
1390 assert(!must_be_resident);
1391
1392 if (object != first_object) {
1393 vm_object_paging_end(object);
1394 vm_object_unlock(object);
1395
1396 object = first_object;
1397 offset = first_offset;
1398 vm_object_lock(object);
1399 }
1400 m = first_m;
1401 assert(m->object == object);
1402 first_m = VM_PAGE_NULL;
1403
1404 /*
1405 * check for any conditions that prevent
1406 * us from creating a new zero-fill page
1407 * vm_fault_check will do all of the
1408 * fault cleanup in the case of an error condition
1409 * including resetting the thread_interrupt_level
1410 */
1411 error = vm_fault_check(object, m, first_m, interruptible_state);
1412
1413 if (error != VM_FAULT_SUCCESS)
1414 return (error);
1415
1416 if (m == VM_PAGE_NULL) {
1417 m = vm_page_grab();
1418
1419 if (m == VM_PAGE_NULL) {
1420 vm_fault_cleanup(object, VM_PAGE_NULL);
1421 thread_interrupt_level(interruptible_state);
1422
1423 return (VM_FAULT_MEMORY_SHORTAGE);
1424 }
1425 vm_page_insert(m, object, offset);
1426 }
1427 my_fault = vm_fault_zero_page(m, no_zero_fill);
1428
1429 break;
1430
1431 } else {
1432 /*
1433 * Move on to the next object. Lock the next
1434 * object before unlocking the current one.
1435 */
1436 if ((object != first_object) || must_be_resident)
1437 vm_object_paging_end(object);
1438
1439 offset += object->shadow_offset;
1440 fault_info->lo_offset += object->shadow_offset;
1441 fault_info->hi_offset += object->shadow_offset;
1442 access_required = VM_PROT_READ;
1443
1444 vm_object_lock(next_object);
1445 vm_object_unlock(object);
1446
1447 object = next_object;
1448 vm_object_paging_begin(object);
1449 }
1450 }
1451
1452 /*
1453 * PAGE HAS BEEN FOUND.
1454 *
1455 * This page (m) is:
1456 * busy, so that we can play with it;
1457 * not absent, so that nobody else will fill it;
1458 * possibly eligible for pageout;
1459 *
1460 * The top-level page (first_m) is:
1461 * VM_PAGE_NULL if the page was found in the
1462 * top-level object;
1463 * busy, not absent, and ineligible for pageout.
1464 *
1465 * The current object (object) is locked. A paging
1466 * reference is held for the current and top-level
1467 * objects.
1468 */
1469
1470 #if TRACEFAULTPAGE
1471 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1472 #endif
1473 #if EXTRA_ASSERTIONS
1474 if (m != VM_PAGE_NULL) {
1475 assert(m->busy && !m->absent);
1476 assert((first_m == VM_PAGE_NULL) ||
1477 (first_m->busy && !first_m->absent &&
1478 !first_m->active && !first_m->inactive));
1479 }
1480 #endif /* EXTRA_ASSERTIONS */
1481
1482 /*
1483 * ENCRYPTED SWAP:
1484 * If we found a page, we must have decrypted it before we
1485 * get here...
1486 */
1487 if (m != VM_PAGE_NULL) {
1488 ASSERT_PAGE_DECRYPTED(m);
1489 }
1490
1491 XPR(XPR_VM_FAULT,
1492 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1493 (integer_t)object, offset, (integer_t)m,
1494 (integer_t)first_object, (integer_t)first_m);
1495
1496 /*
1497 * If the page is being written, but isn't
1498 * already owned by the top-level object,
1499 * we have to copy it into a new page owned
1500 * by the top-level object.
1501 */
1502 if ((object != first_object) && (m != VM_PAGE_NULL)) {
1503
1504 #if TRACEFAULTPAGE
1505 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1506 #endif
1507 if (fault_type & VM_PROT_WRITE) {
1508 vm_page_t copy_m;
1509
1510 /*
1511 * We only really need to copy if we
1512 * want to write it.
1513 */
1514 assert(!must_be_resident);
1515
1516 /*
1517                          * Are we protecting the system from
1518                          * backing store exhaustion?  If so,
1519                          * sleep unless we are privileged.
1520 */
1521 if (vm_backing_store_low) {
1522 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1523
1524 RELEASE_PAGE(m);
1525 vm_fault_cleanup(object, first_m);
1526
1527 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1528
1529 thread_block(THREAD_CONTINUE_NULL);
1530 thread_interrupt_level(interruptible_state);
1531
1532 return (VM_FAULT_RETRY);
1533 }
1534 }
1535 /*
1536 * If we try to collapse first_object at this
1537 * point, we may deadlock when we try to get
1538 * the lock on an intermediate object (since we
1539 * have the bottom object locked). We can't
1540 * unlock the bottom object, because the page
1541 * we found may move (by collapse) if we do.
1542 *
1543 * Instead, we first copy the page. Then, when
1544 * we have no more use for the bottom object,
1545 * we unlock it and try to collapse.
1546 *
1547 * Note that we copy the page even if we didn't
1548 * need to... that's the breaks.
1549 */
1550
1551 /*
1552 * Allocate a page for the copy
1553 */
1554 copy_m = vm_page_grab();
1555
1556 if (copy_m == VM_PAGE_NULL) {
1557 RELEASE_PAGE(m);
1558
1559 vm_fault_cleanup(object, first_m);
1560 thread_interrupt_level(interruptible_state);
1561
1562 return (VM_FAULT_MEMORY_SHORTAGE);
1563 }
1564 XPR(XPR_VM_FAULT,
1565 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1566 (integer_t)object, offset,
1567 (integer_t)m, (integer_t)copy_m, 0);
1568
1569 vm_page_copy(m, copy_m);
1570
1571 /*
1572 * If another map is truly sharing this
1573 * page with us, we have to flush all
1574 * uses of the original page, since we
1575 * can't distinguish those which want the
1576 * original from those which need the
1577 * new copy.
1578 *
1579 * XXXO If we know that only one map has
1580 * access to this page, then we could
1581 * avoid the pmap_disconnect() call.
1582 */
1583 if (m->pmapped)
1584 pmap_disconnect(m->phys_page);
1585
1586 assert(!m->cleaning);
1587
1588 /*
1589 * We no longer need the old page or object.
1590 */
1591 PAGE_WAKEUP_DONE(m);
1592 vm_object_paging_end(object);
1593 vm_object_unlock(object);
1594
1595 my_fault = DBG_COW_FAULT;
1596 VM_STAT_INCR(cow_faults);
1597 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1598 current_task()->cow_faults++;
1599
1600 object = first_object;
1601 offset = first_offset;
1602
1603 vm_object_lock(object);
1604 /*
1605 * get rid of the place holder
1606 * page that we soldered in earlier
1607 */
1608 VM_PAGE_FREE(first_m);
1609 first_m = VM_PAGE_NULL;
1610
1611 /*
1612 * and replace it with the
1613 * page we just copied into
1614 */
1615 assert(copy_m->busy);
1616 vm_page_insert(copy_m, object, offset);
1617 copy_m->dirty = TRUE;
1618
1619 m = copy_m;
1620 /*
1621 * Now that we've gotten the copy out of the
1622 * way, let's try to collapse the top object.
1623 * But we have to play ugly games with
1624 * paging_in_progress to do that...
1625 */
1626 vm_object_paging_end(object);
1627 vm_object_collapse(object, offset, TRUE);
1628 vm_object_paging_begin(object);
1629
1630 } else
1631 *protection &= (~VM_PROT_WRITE);
1632 }
1633 /*
1634 * Now check whether the page needs to be pushed into the
1635 * copy object. The use of asymmetric copy on write for
1636 * shared temporary objects means that we may do two copies to
1637 * satisfy the fault; one above to get the page from a
1638 * shadowed object, and one here to push it into the copy.
1639 */
1640 try_failed_count = 0;
1641
1642 while ((copy_object = first_object->copy) != VM_OBJECT_NULL && (m != VM_PAGE_NULL)) {
1643 vm_object_offset_t copy_offset;
1644 vm_page_t copy_m;
1645
1646 #if TRACEFAULTPAGE
1647 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1648 #endif
1649 /*
1650 * If the page is being written, but hasn't been
1651 * copied to the copy-object, we have to copy it there.
1652 */
1653 if ((fault_type & VM_PROT_WRITE) == 0) {
1654 *protection &= ~VM_PROT_WRITE;
1655 break;
1656 }
1657
1658 /*
1659 * If the page was guaranteed to be resident,
1660 * we must have already performed the copy.
1661 */
1662 if (must_be_resident)
1663 break;
1664
1665 /*
1666 * Try to get the lock on the copy_object.
1667 */
1668 if (!vm_object_lock_try(copy_object)) {
1669
1670 vm_object_unlock(object);
1671 try_failed_count++;
1672
1673 mutex_pause(try_failed_count); /* wait a bit */
1674 vm_object_lock(object);
1675
1676 continue;
1677 }
1678 try_failed_count = 0;
1679
1680 /*
1681 * Make another reference to the copy-object,
1682 * to keep it from disappearing during the
1683 * copy.
1684 */
1685 vm_object_reference_locked(copy_object);
1686
1687 /*
1688 * Does the page exist in the copy?
1689 */
1690 copy_offset = first_offset - copy_object->shadow_offset;
1691
1692 if (copy_object->size <= copy_offset)
1693 /*
1694 * Copy object doesn't cover this page -- do nothing.
1695 */
1696 ;
1697 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1698 /*
1699 * Page currently exists in the copy object
1700 */
1701 if (copy_m->busy) {
1702 /*
1703 * If the page is being brought
1704 * in, wait for it and then retry.
1705 */
1706 RELEASE_PAGE(m);
1707
1708 /*
1709 * take an extra ref so object won't die
1710 */
1711 vm_object_reference_locked(copy_object);
1712 vm_object_unlock(copy_object);
1713 vm_fault_cleanup(object, first_m);
1714 counter(c_vm_fault_page_block_backoff_kernel++);
1715
1716 vm_object_lock(copy_object);
1717 assert(copy_object->ref_count > 0);
1718 VM_OBJ_RES_DECR(copy_object);
1719 vm_object_lock_assert_exclusive(copy_object);
1720 copy_object->ref_count--;
1721 assert(copy_object->ref_count > 0);
1722 copy_m = vm_page_lookup(copy_object, copy_offset);
1723 /*
1724 * ENCRYPTED SWAP:
1725 * it's OK if the "copy_m" page is encrypted,
1726 * because we're not moving it nor handling its
1727 * contents.
1728 */
1729 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1730 PAGE_ASSERT_WAIT(copy_m, interruptible);
1731
1732 vm_object_unlock(copy_object);
1733 wait_result = thread_block(THREAD_CONTINUE_NULL);
1734 vm_object_deallocate(copy_object);
1735
1736 goto backoff;
1737 } else {
1738 vm_object_unlock(copy_object);
1739 vm_object_deallocate(copy_object);
1740 thread_interrupt_level(interruptible_state);
1741
1742 return (VM_FAULT_RETRY);
1743 }
1744 }
1745 }
1746 else if (!PAGED_OUT(copy_object, copy_offset)) {
1747 /*
1748 * If PAGED_OUT is TRUE, then the page used to exist
1749 * in the copy-object, and has already been paged out.
1750 * We don't need to repeat this. If PAGED_OUT is
1751 * FALSE, then either we don't know (!pager_created,
1752 * for example) or it hasn't been paged out.
1753 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1754 * We must copy the page to the copy object.
1755 */
1756
1757 if (vm_backing_store_low) {
1758 /*
1759                          * we are protecting the system from
1760                          * backing store exhaustion, so
1761                          * sleep unless we are privileged.
1762 */
1763 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1764 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1765
1766 RELEASE_PAGE(m);
1767 VM_OBJ_RES_DECR(copy_object);
1768 vm_object_lock_assert_exclusive(copy_object);
1769 copy_object->ref_count--;
1770 assert(copy_object->ref_count > 0);
1771
1772 vm_object_unlock(copy_object);
1773 vm_fault_cleanup(object, first_m);
1774 thread_block(THREAD_CONTINUE_NULL);
1775 thread_interrupt_level(interruptible_state);
1776
1777 return (VM_FAULT_RETRY);
1778 }
1779 }
1780 /*
1781 * Allocate a page for the copy
1782 */
1783 copy_m = vm_page_alloc(copy_object, copy_offset);
1784
1785 if (copy_m == VM_PAGE_NULL) {
1786 RELEASE_PAGE(m);
1787
1788 VM_OBJ_RES_DECR(copy_object);
1789 vm_object_lock_assert_exclusive(copy_object);
1790 copy_object->ref_count--;
1791 assert(copy_object->ref_count > 0);
1792
1793 vm_object_unlock(copy_object);
1794 vm_fault_cleanup(object, first_m);
1795 thread_interrupt_level(interruptible_state);
1796
1797 return (VM_FAULT_MEMORY_SHORTAGE);
1798 }
1799 /*
1800 * Must copy page into copy-object.
1801 */
1802 vm_page_copy(m, copy_m);
1803
1804 /*
1805 * If the old page was in use by any users
1806 * of the copy-object, it must be removed
1807 * from all pmaps. (We can't know which
1808 * pmaps use it.)
1809 */
1810 if (m->pmapped)
1811 pmap_disconnect(m->phys_page);
1812
1813 /*
1814 * If there's a pager, then immediately
1815 * page out this page, using the "initialize"
1816 * option. Else, we use the copy.
1817 */
1818 if ((!copy_object->pager_created)
1819 #if MACH_PAGEMAP
1820 || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
1821 #endif
1822 ) {
1823
1824 vm_page_lockspin_queues();
1825 assert(!m->cleaning);
1826 vm_page_activate(copy_m);
1827 vm_page_unlock_queues();
1828
1829 copy_m->dirty = TRUE;
1830 PAGE_WAKEUP_DONE(copy_m);
1831 }
1832 else {
1833 assert(copy_m->busy == TRUE);
1834 assert(!m->cleaning);
1835
1836 /*
1837 * dirty is protected by the object lock
1838 */
1839 copy_m->dirty = TRUE;
1840
1841 /*
1842 * The page is already ready for pageout:
1843 * not on pageout queues and busy.
1844 * Unlock everything except the
1845 * copy_object itself.
1846 */
1847 vm_object_unlock(object);
1848
1849 /*
1850 * Write the page to the copy-object,
1851 * flushing it from the kernel.
1852 */
1853 vm_pageout_initialize_page(copy_m);
1854
1855 /*
1856 * Since the pageout may have
1857 * temporarily dropped the
1858 * copy_object's lock, we
1859 * check whether we'll have
1860 * to deallocate the hard way.
1861 */
1862 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
1863 vm_object_unlock(copy_object);
1864 vm_object_deallocate(copy_object);
1865 vm_object_lock(object);
1866
1867 continue;
1868 }
1869 /*
1870 * Pick back up the old object's
1871 * lock. [It is safe to do so,
1872 * since it must be deeper in the
1873 * object tree.]
1874 */
1875 vm_object_lock(object);
1876 }
1877 /*
1878 * Because we're pushing a page upward
1879 * in the object tree, we must restart
1880 * any faults that are waiting here.
1881 * [Note that this is an expansion of
1882 * PAGE_WAKEUP that uses the THREAD_RESTART
1883 * wait result]. Can't turn off the page's
1884 * busy bit because we're not done with it.
1885 */
1886 if (m->wanted) {
1887 m->wanted = FALSE;
1888 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
1889 }
1890 }
1891 /*
1892 * The reference count on copy_object must be
1893 * at least 2: one for our extra reference,
1894 * and at least one from the outside world
1895 * (we checked that when we last locked
1896 * copy_object).
1897 */
1898 vm_object_lock_assert_exclusive(copy_object);
1899 copy_object->ref_count--;
1900 assert(copy_object->ref_count > 0);
1901
1902 VM_OBJ_RES_DECR(copy_object);
1903 vm_object_unlock(copy_object);
1904
1905 break;
1906 }
1907 *result_page = m;
1908 *top_page = first_m;
1909
1910 XPR(XPR_VM_FAULT,
1911 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1912 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0);
1913
1914 if (m != VM_PAGE_NULL) {
1915 if (my_fault == DBG_PAGEIN_FAULT) {
1916
1917 VM_STAT_INCR(pageins);
1918 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
1919 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1920 current_task()->pageins++;
1921
1922 if (m->object->internal) {
1923 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
1924 } else {
1925 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
1926 }
1927
1928 /*
1929 * evaluate access pattern and update state
1930 * vm_fault_deactivate_behind depends on the
1931 * state being up to date
1932 */
1933 vm_fault_is_sequential(object, offset, fault_info->behavior);
1934
1935 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
1936 }
1937 if (type_of_fault)
1938 *type_of_fault = my_fault;
1939 } else
1940 vm_object_unlock(object);
1941
1942 thread_interrupt_level(interruptible_state);
1943
1944 #if TRACEFAULTPAGE
1945 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
1946 #endif
1947 return (VM_FAULT_SUCCESS);
1948
1949 backoff:
1950 thread_interrupt_level(interruptible_state);
1951
1952 if (wait_result == THREAD_INTERRUPTED)
1953 return (VM_FAULT_INTERRUPTED);
1954 return (VM_FAULT_RETRY);
1955
1956 #undef RELEASE_PAGE
1957 }
1958
1959
1960
1961 /*
1962 * page queue lock must NOT be held
1963 * m->object must be locked
1964 *
1965 * NOTE: m->object could be locked "shared" only if we are called
1966 * from vm_fault() as part of a soft fault. If so, we must be
1967 * careful not to modify the VM object in any way that is not
1968 * legal under a shared lock...
1969 */
1970 unsigned long cs_enter_tainted_rejected = 0;
1971 unsigned long cs_enter_tainted_accepted = 0;
1972 kern_return_t
1973 vm_fault_enter(vm_page_t m,
1974 pmap_t pmap,
1975 vm_map_offset_t vaddr,
1976 vm_prot_t prot,
1977 boolean_t wired,
1978 boolean_t change_wiring,
1979 boolean_t no_cache,
1980 int *type_of_fault)
1981 {
1982 unsigned int cache_attr;
1983 kern_return_t kr;
1984 boolean_t previously_pmapped = m->pmapped;
1985
1986 vm_object_lock_assert_held(m->object);
1987 #if DEBUG
1988 mutex_assert(&vm_page_queue_lock, MA_NOTOWNED);
1989 #endif /* DEBUG */
1990
1991 if (m->phys_page == vm_page_guard_addr) {
1992 assert(m->fictitious);
1993 return KERN_SUCCESS;
1994 }
1995
1996 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
1997
1998 if (m->object->code_signed && pmap != kernel_pmap &&
1999 (!m->cs_validated || m->wpmapped)) {
2000 vm_object_lock_assert_exclusive(m->object);
2001
2002 if (m->cs_validated && m->wpmapped) {
2003 vm_cs_revalidates++;
2004 }
2005
2006 /*
2007 * CODE SIGNING:
2008 * This page comes from a VM object backed by a signed
2009 * memory object. We are about to enter it into a process
2010 * address space, so we need to validate its signature.
2011 */
2012 /* VM map is locked, so 1 ref will remain on VM object */
2013 vm_page_validate_cs(m);
2014 }
2015
2016 if (m->pmapped == FALSE) {
2017 /*
2018 * This is the first time this page is being
2019 * mapped in an address space (pmapped == FALSE).
2020 *
2021 * Part of that page may still be in the data cache
2022 * and not flushed to memory. In case we end up
2023 * accessing that page via the instruction cache,
2024 * we need to ensure that the 2 caches are in sync.
2025 */
2026 pmap_sync_page_data_phys(m->phys_page);
2027
2028 if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2029 /*
2030 * found it in the cache, but this
2031 * is the first fault-in of the page (m->pmapped == FALSE)
2032 * so it must have come in as part of
2033 * a cluster... account 1 pagein against it
2034 */
2035 VM_STAT_INCR(pageins);
2036 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2037
2038 if (m->object->internal) {
2039 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2040 } else {
2041 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2042 }
2043
2044 current_task()->pageins++;
2045
2046 *type_of_fault = DBG_PAGEIN_FAULT;
2047 }
2048 VM_PAGE_CONSUME_CLUSTERED(m);
2049
2050 } else if (cache_attr != VM_WIMG_DEFAULT)
2051 pmap_sync_page_attributes_phys(m->phys_page);
2052
2053 if (*type_of_fault != DBG_COW_FAULT) {
2054 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2055
2056 if (pmap == kernel_pmap) {
2057 DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2058 }
2059 }
2060
2061 if (m->cs_tainted) {
2062 /*
2063 * CODE SIGNING:
2064 * This page has been tainted and cannot be trusted.
2065 * Let's notify the current process and let it take any
2066 * necessary precautions before we enter the tainted page
2067 * into its address space.
2068 */
2069 if (cs_invalid_page()) {
2070 /* reject the tainted page: abort the page fault */
2071 kr = KERN_MEMORY_ERROR;
2072 cs_enter_tainted_rejected++;
2073 } else {
2074 /* proceed with the tainted page */
2075 kr = KERN_SUCCESS;
2076 cs_enter_tainted_accepted++;
2077 }
2078 if (cs_debug || kr != KERN_SUCCESS) {
2079 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2080 "page %p obj %p off 0x%llx *** TAINTED ***\n",
2081 (long long)vaddr, m, m->object, m->offset);
2082 }
2083 } else {
2084 /* proceed with the valid page */
2085 kr = KERN_SUCCESS;
2086 }
2087
2088 if (kr == KERN_SUCCESS) {
2089 /*
2090 * NOTE: we may only hold the vm_object lock SHARED
2091 * at this point, but the update of pmapped is ok
2092 * since this is the ONLY bit updated behind the SHARED
2093 * lock... however, we need to figure out how to do an atomic
2094 * update on a bit field to make this less fragile... right
2095 * now I don't know how to coerce 'C' to give me the offset info
2096 * that's needed for an AtomicCompareAndSwap
2097 */
2098 m->pmapped = TRUE;
2099 if (prot & VM_PROT_WRITE) {
2100 vm_object_lock_assert_exclusive(m->object);
2101 m->wpmapped = TRUE;
2102 }
2103
2104 PMAP_ENTER(pmap, vaddr, m, prot, cache_attr, wired);
2105 }
2106
2107 /*
2108 * Hold queues lock to manipulate
2109 * the page queues. Change wiring
2110 * case is obvious.
2111 */
2112 if (change_wiring) {
2113 vm_page_lockspin_queues();
2114
2115 if (wired) {
2116 if (kr == KERN_SUCCESS) {
2117 vm_page_wire(m);
2118 }
2119 } else {
2120 vm_page_unwire(m);
2121 }
2122 vm_page_unlock_queues();
2123
2124 } else {
2125 if (kr != KERN_SUCCESS) {
2126 vm_page_lock_queues();
2127 vm_page_deactivate(m);
2128 vm_page_unlock_queues();
2129 } else {
2130 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count && !m->throttled) {
2131 vm_page_lockspin_queues();
2132 /*
2133 * test again now that we hold the page queue lock
2134 */
2135 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count) {
2136
2137 /*
2138 * If this is a no_cache mapping and the page has never been
2139 * mapped before or was previously a no_cache page, then we
2140 * want to leave pages in the speculative state so that they
2141 * can be readily recycled if free memory runs low. Otherwise
2142 * the page is activated as normal.
2143 */
2144
2145 if (no_cache && (!previously_pmapped || m->no_cache)) {
2146 m->no_cache = TRUE;
2147
2148 if (m->active || m->inactive)
2149 VM_PAGE_QUEUES_REMOVE(m);
2150
2151 if (!m->speculative)
2152 vm_page_speculate(m, TRUE);
2153
2154 } else if (!m->active && !m->inactive)
2155 vm_page_activate(m);
2156
2157 }
2158
2159 vm_page_unlock_queues();
2160 }
2161 }
2162 }
2163 return kr;
2164 }
2165
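/*
 * A minimal, compiled-out sketch related to the NOTE in vm_fault_enter()
 * above: if "pmapped" were widened from a bit field to its own 32-bit
 * word (it is not today, so this cannot be used as-is), the lock-free
 * update could be expressed with OSCompareAndSwap() from
 * <libkern/OSAtomic.h>.  The helper name and flag layout below are
 * hypothetical, for illustration only.
 */
#if 0
static void
vm_page_set_flag_atomic(volatile UInt32 *flag_word, UInt32 flag_bit)
{
	UInt32	old_value;

	do {
		old_value = *flag_word;
	} while (!OSCompareAndSwap(old_value, old_value | flag_bit, flag_word));
}
#endif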
2166
2167 /*
2168 * Routine: vm_fault
2169 * Purpose:
2170 * Handle page faults, including pseudo-faults
2171 * used to change the wiring status of pages.
2172 * Returns:
2173 * Explicit continuations have been removed.
2174 * Implementation:
2175 * vm_fault and vm_fault_page save mucho state
2176 * in the moral equivalent of a closure. The state
2177 * structure is allocated when first entering vm_fault
2178 * and deallocated when leaving vm_fault.
2179 */
2180
2181 extern int _map_enter_debug;
2182
2183 unsigned long vm_fault_collapse_total = 0;
2184 unsigned long vm_fault_collapse_skipped = 0;
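
/*
 * For reference only: a machine-dependent trap handler typically drives
 * vm_fault() roughly as sketched below.  The function and variable names
 * here are illustrative, not the actual arch-specific code.
 */
#if 0
static kern_return_t
handle_user_page_fault(vm_map_t map, vm_map_offset_t vaddr, vm_prot_t prot)
{
	return vm_fault(map,
			vm_map_trunc_page(vaddr),	/* fault on the page boundary */
			prot,				/* VM_PROT_READ and/or VM_PROT_WRITE */
			FALSE,				/* not a wiring change */
			THREAD_ABORTSAFE,		/* user faults may be aborted */
			NULL,				/* no caller-supplied pmap */
			0);
}
#endif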
2185
2186 kern_return_t
2187 vm_fault(
2188 vm_map_t map,
2189 vm_map_offset_t vaddr,
2190 vm_prot_t fault_type,
2191 boolean_t change_wiring,
2192 int interruptible,
2193 pmap_t caller_pmap,
2194 vm_map_offset_t caller_pmap_addr)
2195 {
2196 vm_map_version_t version; /* Map version for verification */
2197 boolean_t wired; /* Should mapping be wired down? */
2198 vm_object_t object; /* Top-level object */
2199 vm_object_offset_t offset; /* Top-level offset */
2200 vm_prot_t prot; /* Protection for mapping */
2201 vm_object_t old_copy_object; /* Saved copy object */
2202 vm_page_t result_page; /* Result of vm_fault_page */
2203 vm_page_t top_page; /* Placeholder page */
2204 kern_return_t kr;
2205
2206 vm_page_t m; /* Fast access to result_page */
2207 kern_return_t error_code;
2208 vm_object_t cur_object;
2209 vm_object_offset_t cur_offset;
2210 vm_page_t cur_m;
2211 vm_object_t new_object;
2212 int type_of_fault;
2213 pmap_t pmap;
2214 boolean_t interruptible_state;
2215 vm_map_t real_map = map;
2216 vm_map_t original_map = map;
2217 vm_prot_t original_fault_type;
2218 struct vm_object_fault_info fault_info;
2219 boolean_t need_collapse = FALSE;
2220 int object_lock_type = 0;
2221 int cur_object_lock_type;
2222
2223
2224 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
2225 (int)((uint64_t)vaddr >> 32),
2226 (int)vaddr,
2227 0,
2228 0,
2229 0);
2230
2231 if (get_preemption_level() != 0) {
2232 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
2233 (int)((uint64_t)vaddr >> 32),
2234 (int)vaddr,
2235 KERN_FAILURE,
2236 0,
2237 0);
2238
2239 return (KERN_FAILURE);
2240 }
2241 interruptible_state = thread_interrupt_level(interruptible);
2242
2243 VM_STAT_INCR(faults);
2244 current_task()->faults++;
2245 original_fault_type = fault_type;
2246
2247 if (fault_type & VM_PROT_WRITE)
2248 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2249 else
2250 object_lock_type = OBJECT_LOCK_SHARED;
2251
2252 cur_object_lock_type = OBJECT_LOCK_SHARED;
2253
2254 RetryFault:
2255 /*
2256 * assume we will hit a page in the cache
2257 * otherwise, explicitly override with
2258 * the real fault type once we determine it
2259 */
2260 type_of_fault = DBG_CACHE_HIT_FAULT;
2261
2262 /*
2263 * Find the backing store object and offset into
2264 * it to begin the search.
2265 */
2266 fault_type = original_fault_type;
2267 map = original_map;
2268 vm_map_lock_read(map);
2269
2270 kr = vm_map_lookup_locked(&map, vaddr, fault_type,
2271 object_lock_type, &version,
2272 &object, &offset, &prot, &wired,
2273 &fault_info,
2274 &real_map);
2275
2276 if (kr != KERN_SUCCESS) {
2277 vm_map_unlock_read(map);
2278 goto done;
2279 }
2280 pmap = real_map->pmap;
2281 fault_info.interruptible = interruptible;
2282
2283 /*
2284 * If the page is wired, we must fault for the current protection
2285 * value, to avoid further faults.
2286 */
2287 if (wired) {
2288 fault_type = prot | VM_PROT_WRITE;
2289 /*
2290 * since we're treating this fault as a 'write'
2291 * we must hold the top object lock exclusively
2292 */
2293 if (object_lock_type == OBJECT_LOCK_SHARED) {
2294
2295 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2296
2297 if (vm_object_lock_upgrade(object) == FALSE) {
2298 /*
2299 * couldn't upgrade, so explicitly
2300 * take the lock exclusively
2301 */
2302 vm_object_lock(object);
2303 }
2304 }
2305 }
2306
2307 #if VM_FAULT_CLASSIFY
2308 /*
2309 * Temporary data gathering code
2310 */
2311 vm_fault_classify(object, offset, fault_type);
2312 #endif
2313 /*
2314 * Fast fault code. The basic idea is to do as much as
2315 * possible while holding the map lock and object locks.
2316 * Busy pages are not used until the object lock has to
2317 * be dropped to do something (copy, zero fill, pmap enter).
2318 * Similarly, paging references aren't acquired until that
2319 * point, and object references aren't used.
2320 *
2321 * If we can figure out what to do
2322 * (zero fill, copy on write, pmap enter) while holding
2323 * the locks, then it gets done. Otherwise, we give up,
2324 * and use the original fault path (which doesn't hold
2325 * the map lock, and relies on busy pages).
2326 * The give up cases include:
2327 * - Have to talk to pager.
2328 * - Page is busy, absent or in error.
2329 * - Pager has locked out desired access.
2330 * - Fault needs to be restarted.
2331 * - Have to push page into copy object.
2332 *
2333 * The code is an infinite loop that moves one level down
2334 * the shadow chain each time. cur_object and cur_offset
2335 * refer to the current object being examined. object and offset
2336 * are the original object from the map. The loop is at the
2337 * top level if and only if object and cur_object are the same.
2338 *
2339 * Invariants: Map lock is held throughout. Lock is held on
2340 * original object and cur_object (if different) when
2341 * continuing or exiting loop.
2342 *
2343 */
2344
2345
2346 /*
2347 * If this page is to be inserted in a copy delay object
2348 * for writing, and if the object has a copy, then the
2349 * copy delay strategy is implemented in the slow fault path (vm_fault_page).
2350 */
2351 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
2352 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
2353 goto handle_copy_delay;
2354
2355 cur_object = object;
2356 cur_offset = offset;
2357
2358 while (TRUE) {
2359 m = vm_page_lookup(cur_object, cur_offset);
2360
2361 if (m != VM_PAGE_NULL) {
2362 if (m->busy) {
2363 wait_result_t result;
2364
2365 /*
2366 * in order to do the PAGE_ASSERT_WAIT, we must
2367 * have object that 'm' belongs to locked exclusively
2368 */
2369 if (object != cur_object) {
2370 vm_object_unlock(object);
2371
2372 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2373
2374 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2375
2376 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2377 /*
2378 * couldn't upgrade so go do a full retry
2379 * immediately since we've already dropped
2380 * the top object lock associated with this page
2381 * and the current one got dropped due to the
2382 * failed upgrade... the state is no longer valid
2383 */
2384 vm_map_unlock_read(map);
2385 if (real_map != map)
2386 vm_map_unlock(real_map);
2387
2388 goto RetryFault;
2389 }
2390 }
2391 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2392
2393 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2394
2395 if (vm_object_lock_upgrade(object) == FALSE) {
2396 /*
2397 * couldn't upgrade, so explicitly take the lock
2398 * exclusively and go relookup the page since we
2399 * will have dropped the object lock and
2400 * a different thread could have inserted
2401 * a page at this offset
2402 * no need for a full retry since we're
2403 * at the top level of the object chain
2404 */
2405 vm_object_lock(object);
2406
2407 continue;
2408 }
2409 }
2410 vm_map_unlock_read(map);
2411 if (real_map != map)
2412 vm_map_unlock(real_map);
2413
2414 result = PAGE_ASSERT_WAIT(m, interruptible);
2415
2416 vm_object_unlock(cur_object);
2417
2418 if (result == THREAD_WAITING) {
2419 result = thread_block(THREAD_CONTINUE_NULL);
2420
2421 counter(c_vm_fault_page_block_busy_kernel++);
2422 }
2423 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2424 goto RetryFault;
2425
2426 kr = KERN_ABORTED;
2427 goto done;
2428 }
2429 if (m->phys_page == vm_page_guard_addr) {
2430 /*
2431 * Guard page: let the slow path deal with it
2432 */
2433 break;
2434 }
2435 if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
2436 /*
2437 * Unusual case... let the slow path deal with it
2438 */
2439 break;
2440 }
2441 if (m->encrypted) {
2442 /*
2443 * ENCRYPTED SWAP:
2444 * We've soft-faulted (because it's not in the page
2445 * table) on an encrypted page.
2446 * Keep the page "busy" so that no one messes with
2447 * it during the decryption.
2448 * Release the extra locks we're holding, keep only
2449 * the page's VM object lock.
2450 *
2451 * in order to set 'busy' on 'm', we must
2452 * have object that 'm' belongs to locked exclusively
2453 */
2454 if (object != cur_object) {
2455 vm_object_unlock(object);
2456
2457 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2458
2459 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2460
2461 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2462 /*
2463 * couldn't upgrade so go do a full retry
2464 * immediately since we've already dropped
2465 * the top object lock associated with this page
2466 * and the current one got dropped due to the
2467 * failed upgrade... the state is no longer valid
2468 */
2469 vm_map_unlock_read(map);
2470 if (real_map != map)
2471 vm_map_unlock(real_map);
2472
2473 goto RetryFault;
2474 }
2475 }
2476 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2477
2478 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2479
2480 if (vm_object_lock_upgrade(object) == FALSE) {
2481 /*
2482 * couldn't upgrade, so explicitly take the lock
2483 * exclusively and go relookup the page since we
2484 * will have dropped the object lock and
2485 * a different thread could have inserted
2486 * a page at this offset
2487 * no need for a full retry since we're
2488 * at the top level of the object chain
2489 */
2490 vm_object_lock(object);
2491
2492 continue;
2493 }
2494 }
2495 m->busy = TRUE;
2496
2497 vm_map_unlock_read(map);
2498 if (real_map != map)
2499 vm_map_unlock(real_map);
2500
2501 vm_page_decrypt(m, 0);
2502
2503 assert(m->busy);
2504 PAGE_WAKEUP_DONE(m);
2505
2506 vm_object_unlock(cur_object);
2507 /*
2508 * Retry from the top, in case anything
2509 * changed while we were decrypting...
2510 */
2511 goto RetryFault;
2512 }
2513 ASSERT_PAGE_DECRYPTED(m);
2514
2515 if (m->object->code_signed && map != kernel_map &&
2516 (!m->cs_validated || m->wpmapped)) {
2517 /*
2518 * We might need to validate this page
2519 * against its code signature, so we
2520 * want to hold the VM object exclusively.
2521 */
2522 if (object != cur_object) {
2523 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2524 vm_object_unlock(object);
2525 vm_object_unlock(cur_object);
2526
2527 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2528
2529 vm_map_unlock_read(map);
2530 if (real_map != map)
2531 vm_map_unlock(real_map);
2532
2533 goto RetryFault;
2534 }
2535
2536 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2537
2538 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2539
2540 if (vm_object_lock_upgrade(object) == FALSE) {
2541 /*
2542 * couldn't upgrade, so explicitly take the lock
2543 * exclusively and go relookup the page since we
2544 * will have dropped the object lock and
2545 * a different thread could have inserted
2546 * a page at this offset
2547 * no need for a full retry since we're
2548 * at the top level of the object chain
2549 */
2550 vm_object_lock(object);
2551
2552 continue;
2553 }
2554 }
2555 }
2556 /*
2557 * Two cases of map in faults:
2558 * - At top level w/o copy object.
2559 * - Read fault anywhere.
2560 * --> must disallow write.
2561 */
2562
2563 if (object == cur_object && object->copy == VM_OBJECT_NULL) {
2564 if ((fault_type & VM_PROT_WRITE) == 0) {
2565 /*
2566 * This is not a "write" fault, so we
2567 * might not have taken the object lock
2568 * exclusively and we might not be able
2569 * to update the "wpmapped" bit in
2570 * vm_fault_enter().
2571 * Let's just grant read access to
2572 * the page for now and we'll
2573 * soft-fault again if we need write
2574 * access later...
2575 */
2576 prot &= ~VM_PROT_WRITE;
2577 }
2578 goto FastPmapEnter;
2579 }
2580
2581 if ((fault_type & VM_PROT_WRITE) == 0) {
2582
2583 prot &= ~VM_PROT_WRITE;
2584
2585 /*
2586 * Set up to map the page...
2587 * mark the page busy, drop
2588 * unneeded object lock
2589 */
2590 if (object != cur_object) {
2591 /*
2592 * don't need the original object anymore
2593 */
2594 vm_object_unlock(object);
2595
2596 /*
2597 * switch to the object that has the new page
2598 */
2599 object = cur_object;
2600 object_lock_type = cur_object_lock_type;
2601 }
2602 FastPmapEnter:
2603 /*
2604 * prepare for the pmap_enter...
2605 * object and map are both locked
2606 * m contains valid data
2607 * object == m->object
2608 * cur_object == NULL or it's been unlocked
2609 * no paging references on either object or cur_object
2610 */
2611 #if MACH_KDB
2612 if (db_watchpoint_list && (fault_type & VM_PROT_WRITE) == 0)
2613 prot &= ~VM_PROT_WRITE;
2614 #endif
2615 if (caller_pmap) {
2616 kr = vm_fault_enter(m,
2617 caller_pmap,
2618 caller_pmap_addr,
2619 prot,
2620 wired,
2621 change_wiring,
2622 fault_info.no_cache,
2623 &type_of_fault);
2624 } else {
2625 kr = vm_fault_enter(m,
2626 pmap,
2627 vaddr,
2628 prot,
2629 wired,
2630 change_wiring,
2631 fault_info.no_cache,
2632 &type_of_fault);
2633 }
2634
2635 if (need_collapse == TRUE)
2636 vm_object_collapse(object, offset, TRUE);
2637
2638 if (type_of_fault == DBG_PAGEIN_FAULT) {
2639 /*
2640 * evaluate access pattern and update state
2641 * vm_fault_deactivate_behind depends on the
2642 * state being up to date
2643 */
2644 vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
2645
2646 vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
2647 }
2648 /*
2649 * That's it, clean up and return.
2650 */
2651 if (m->busy)
2652 PAGE_WAKEUP_DONE(m);
2653
2654 vm_object_unlock(object);
2655
2656 vm_map_unlock_read(map);
2657 if (real_map != map)
2658 vm_map_unlock(real_map);
2659
2660 goto done;
2661 }
2662 /*
2663 * COPY ON WRITE FAULT
2664 *
2665 * If objects match, then
2666 * object->copy must not be NULL (else control
2667 * would be in previous code block), and we
2668 * have a potential push into the copy object
2669 * which we can't cope with here.
2670 */
2671 if (cur_object == object) {
2672 /*
2673 * must take the slow path to
2674 * deal with the copy push
2675 */
2676 break;
2677 }
2678 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
2679
2680 /*
2681 * This is now a shadow based copy on write
2682 * fault -- it requires a copy up the shadow
2683 * chain.
2684 *
2685 * Allocate a page in the original top level
2686 * object. Give up if allocate fails. Also
2687 * need to remember current page, as it's the
2688 * source of the copy.
2689 *
2690 * at this point we hold locks on both
2691 * object and cur_object... no need to take
2692 * paging refs or mark pages BUSY since
2693 * we don't drop either object lock until
2694 * the page has been copied and inserted
2695 */
2696 cur_m = m;
2697 m = vm_page_grab();
2698
2699 if (m == VM_PAGE_NULL) {
2700 /*
2701 * no free page currently available...
2702 * must take the slow path
2703 */
2704 break;
2705 }
2706 /*
2707 * Now do the copy. Mark the source page busy...
2708 *
2709 * NOTE: This code holds the map lock across
2710 * the page copy.
2711 */
2712 vm_page_copy(cur_m, m);
2713 vm_page_insert(m, object, offset);
2714 m->dirty = TRUE;
2715
2716 /*
2717 * Now cope with the source page and object
2718 */
2719 if (object->ref_count > 1 && cur_m->pmapped)
2720 pmap_disconnect(cur_m->phys_page);
2721
2722 need_collapse = TRUE;
2723
2724 if (!cur_object->internal &&
2725 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
2726 /*
2727 * The object from which we've just
2728 * copied a page is most probably backed
2729 * by a vnode. We don't want to waste too
2730 * much time trying to collapse the VM objects
2731 * and create a bottleneck when several tasks
2732 * map the same file.
2733 */
2734 if (cur_object->copy == object) {
2735 /*
2736 * Shared mapping or no COW yet.
2737 * We can never collapse a copy
2738 * object into its backing object.
2739 */
2740 need_collapse = FALSE;
2741 } else if (cur_object->copy == object->shadow &&
2742 object->shadow->resident_page_count == 0) {
2743 /*
2744 * Shared mapping after a COW occurred.
2745 */
2746 need_collapse = FALSE;
2747 }
2748 }
2749 vm_object_unlock(cur_object);
2750
2751 if (need_collapse == FALSE)
2752 vm_fault_collapse_skipped++;
2753 vm_fault_collapse_total++;
2754
2755 type_of_fault = DBG_COW_FAULT;
2756 VM_STAT_INCR(cow_faults);
2757 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2758 current_task()->cow_faults++;
2759
2760 goto FastPmapEnter;
2761
2762 } else {
2763 /*
2764 * No page at cur_object, cur_offset... m == NULL
2765 */
2766 if (cur_object->pager_created) {
2767 if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) {
2768 /*
2769 * May have to talk to a pager...
2770 * take the slow path.
2771 */
2772 break;
2773 }
2774 /*
2775 * existence map present and indicates
2776 * that the pager doesn't have this page
2777 */
2778 }
2779 if (cur_object->shadow == VM_OBJECT_NULL) {
2780 /*
2781 * Zero fill fault. Page gets
2782 * inserted into the original object.
2783 */
2784 if (cur_object->shadow_severed) {
2785
2786 if (object != cur_object)
2787 vm_object_unlock(cur_object);
2788 vm_object_unlock(object);
2789
2790 vm_map_unlock_read(map);
2791 if (real_map != map)
2792 vm_map_unlock(real_map);
2793
2794 kr = KERN_MEMORY_ERROR;
2795 goto done;
2796 }
2797 if (VM_PAGE_ZFILL_THROTTLED()) {
2798 /*
2799 * drop all of our locks...
2800 * wait until the free queue is
2801 * pumped back up and then
2802 * redrive the fault
2803 */
2804 if (object != cur_object)
2805 vm_object_unlock(cur_object);
2806 vm_object_unlock(object);
2807 vm_map_unlock_read(map);
2808 if (real_map != map)
2809 vm_map_unlock(real_map);
2810
2811 if (vm_page_wait((change_wiring) ?
2812 THREAD_UNINT :
2813 THREAD_ABORTSAFE))
2814 goto RetryFault;
2815
2816 kr = KERN_ABORTED;
2817 goto done;
2818 }
2819 if (vm_backing_store_low) {
2820 /*
2821 * we are protecting the system from
2822 * backing store exhaustion...
2823 * must take the slow path if we're
2824 * not privileged
2825 */
2826 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
2827 break;
2828 }
2829 if (cur_object != object) {
2830 vm_object_unlock(cur_object);
2831
2832 cur_object = object;
2833 }
2834 if (object_lock_type == OBJECT_LOCK_SHARED) {
2835
2836 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2837
2838 if (vm_object_lock_upgrade(object) == FALSE) {
2839 /*
2840 * couldn't upgrade so do a full retry on the fault
2841 * since we dropped the object lock which
2842 * could allow another thread to insert
2843 * a page at this offset
2844 */
2845 vm_map_unlock_read(map);
2846 if (real_map != map)
2847 vm_map_unlock(real_map);
2848
2849 goto RetryFault;
2850 }
2851 }
2852 m = vm_page_alloc(object, offset);
2853
2854 if (m == VM_PAGE_NULL) {
2855 /*
2856 * no free page currently available...
2857 * must take the slow path
2858 */
2859 break;
2860 }
2861
2862 /*
2863 * Now zero fill page...
2864 * the page is probably going to
2865 * be written soon, so don't bother
2866 * to clear the modified bit
2867 *
2868 * NOTE: This code holds the map
2869 * lock across the zero fill.
2870 */
2871 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
2872
2873 goto FastPmapEnter;
2874 }
2875 /*
2876 * On to the next level in the shadow chain
2877 */
2878 cur_offset += cur_object->shadow_offset;
2879 new_object = cur_object->shadow;
2880
2881 /*
2882 * take the new_object's lock with the indicated state
2883 */
2884 if (cur_object_lock_type == OBJECT_LOCK_SHARED)
2885 vm_object_lock_shared(new_object);
2886 else
2887 vm_object_lock(new_object);
2888
2889 if (cur_object != object)
2890 vm_object_unlock(cur_object);
2891
2892 cur_object = new_object;
2893
2894 continue;
2895 }
2896 }
2897 /*
2898 * Cleanup from fast fault failure. Drop any object
2899 * lock other than original and drop map lock.
2900 */
2901 if (object != cur_object)
2902 vm_object_unlock(cur_object);
2903
2904 /*
2905 * must own the object lock exclusively at this point
2906 */
2907 if (object_lock_type == OBJECT_LOCK_SHARED) {
2908 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2909
2910 if (vm_object_lock_upgrade(object) == FALSE) {
2911 /*
2912 * couldn't upgrade, so explicitly
2913 * take the lock exclusively
2914 * no need to retry the fault at this
2915 * point since "vm_fault_page" will
2916 * completely re-evaluate the state
2917 */
2918 vm_object_lock(object);
2919 }
2920 }
2921
2922 handle_copy_delay:
2923 vm_map_unlock_read(map);
2924 if (real_map != map)
2925 vm_map_unlock(real_map);
2926
2927 /*
2928 * Make a reference to this object to
2929 * prevent its disposal while we are messing with
2930 * it. Once we have the reference, the map is free
2931 * to be diddled. Since objects reference their
2932 * shadows (and copies), they will stay around as well.
2933 */
2934 vm_object_reference_locked(object);
2935 vm_object_paging_begin(object);
2936
2937 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
2938
2939 error_code = 0;
2940
2941 kr = vm_fault_page(object, offset, fault_type,
2942 (change_wiring && !wired),
2943 &prot, &result_page, &top_page,
2944 &type_of_fault,
2945 &error_code, map->no_zero_fill,
2946 FALSE, &fault_info);
2947
2948 /*
2949 * if kr != VM_FAULT_SUCCESS, then the paging reference
2950 * has been dropped and the object unlocked... the ref_count
2951 * is still held
2952 *
2953 * if kr == VM_FAULT_SUCCESS, then the paging reference
2954 * is still held along with the ref_count on the original object
2955 *
2956 * if m != NULL, then the object it belongs to
2957 * is returned locked with a paging reference
2958 *
2959 * if top_page != NULL, then it's BUSY and the
2960 * object it belongs to has a paging reference
2961 * but is returned unlocked
2962 */
2963 if (kr != VM_FAULT_SUCCESS) {
2964 /*
2965 * we didn't succeed, lose the object reference immediately.
2966 */
2967 vm_object_deallocate(object);
2968
2969 /*
2970 * See why we failed, and take corrective action.
2971 */
2972 switch (kr) {
2973 case VM_FAULT_MEMORY_SHORTAGE:
2974 if (vm_page_wait((change_wiring) ?
2975 THREAD_UNINT :
2976 THREAD_ABORTSAFE))
2977 goto RetryFault;
2978 /*
2979 * fall thru
2980 */
2981 case VM_FAULT_INTERRUPTED:
2982 kr = KERN_ABORTED;
2983 goto done;
2984 case VM_FAULT_RETRY:
2985 goto RetryFault;
2986 case VM_FAULT_MEMORY_ERROR:
2987 if (error_code)
2988 kr = error_code;
2989 else
2990 kr = KERN_MEMORY_ERROR;
2991 goto done;
2992 }
2993 }
2994 m = result_page;
2995
2996 if (m != VM_PAGE_NULL) {
2997 assert((change_wiring && !wired) ?
2998 (top_page == VM_PAGE_NULL) :
2999 ((top_page == VM_PAGE_NULL) == (m->object == object)));
3000 }
3001
3002 /*
3003 * What to do with the resulting page from vm_fault_page
3004 * if it doesn't get entered into the physical map:
3005 */
3006 #define RELEASE_PAGE(m) \
3007 MACRO_BEGIN \
3008 PAGE_WAKEUP_DONE(m); \
3009 vm_page_lockspin_queues(); \
3010 if (!m->active && !m->inactive && !m->throttled)\
3011 vm_page_activate(m); \
3012 vm_page_unlock_queues(); \
3013 MACRO_END
3014
3015 /*
3016 * We must verify that the maps have not changed
3017 * since our last lookup.
3018 */
3019 if (m != VM_PAGE_NULL) {
3020 old_copy_object = m->object->copy;
3021 vm_object_unlock(m->object);
3022 } else
3023 old_copy_object = VM_OBJECT_NULL;
3024
3025 /*
3026 * no object locks are held at this point
3027 */
3028 if ((map != original_map) || !vm_map_verify(map, &version)) {
3029 vm_object_t retry_object;
3030 vm_object_offset_t retry_offset;
3031 vm_prot_t retry_prot;
3032
3033 /*
3034 * To avoid trying to write_lock the map while another
3035 * thread has it read_locked (in vm_map_pageable), we
3036 * do not try for write permission. If the page is
3037 * still writable, we will get write permission. If it
3038 * is not, or has been marked needs_copy, we enter the
3039 * mapping without write permission, and will merely
3040 * take another fault.
3041 */
3042 map = original_map;
3043 vm_map_lock_read(map);
3044
3045 kr = vm_map_lookup_locked(&map, vaddr,
3046 fault_type & ~VM_PROT_WRITE,
3047 OBJECT_LOCK_EXCLUSIVE, &version,
3048 &retry_object, &retry_offset, &retry_prot,
3049 &wired,
3050 &fault_info,
3051 &real_map);
3052 pmap = real_map->pmap;
3053
3054 if (kr != KERN_SUCCESS) {
3055 vm_map_unlock_read(map);
3056
3057 if (m != VM_PAGE_NULL) {
3058 /*
3059 * retake the lock so that
3060 * we can drop the paging reference
3061 * in vm_fault_cleanup and do the
3062 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3063 */
3064 vm_object_lock(m->object);
3065
3066 RELEASE_PAGE(m);
3067
3068 vm_fault_cleanup(m->object, top_page);
3069 } else {
3070 /*
3071 * retake the lock so that
3072 * we can drop the paging reference
3073 * in vm_fault_cleanup
3074 */
3075 vm_object_lock(object);
3076
3077 vm_fault_cleanup(object, top_page);
3078 }
3079 vm_object_deallocate(object);
3080
3081 goto done;
3082 }
3083 vm_object_unlock(retry_object);
3084
3085 if ((retry_object != object) || (retry_offset != offset)) {
3086
3087 vm_map_unlock_read(map);
3088 if (real_map != map)
3089 vm_map_unlock(real_map);
3090
3091 if (m != VM_PAGE_NULL) {
3092 /*
3093 * retake the lock so that
3094 * we can drop the paging reference
3095 * in vm_fault_cleanup and do the
3096 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3097 */
3098 vm_object_lock(m->object);
3099
3100 RELEASE_PAGE(m);
3101
3102 vm_fault_cleanup(m->object, top_page);
3103 } else {
3104 /*
3105 * retake the lock so that
3106 * we can drop the paging reference
3107 * in vm_fault_cleanup
3108 */
3109 vm_object_lock(object);
3110
3111 vm_fault_cleanup(object, top_page);
3112 }
3113 vm_object_deallocate(object);
3114
3115 goto RetryFault;
3116 }
3117 /*
3118 * Check whether the protection has changed or the object
3119 * has been copied while we left the map unlocked.
3120 */
3121 prot &= retry_prot;
3122 }
3123 if (m != VM_PAGE_NULL) {
3124 vm_object_lock(m->object);
3125
3126 if (m->object->copy != old_copy_object) {
3127 /*
3128 * The copy object changed while the top-level object
3129 * was unlocked, so take away write permission.
3130 */
3131 prot &= ~VM_PROT_WRITE;
3132 }
3133 } else
3134 vm_object_lock(object);
3135
3136 /*
3137 * If we want to wire down this page, but no longer have
3138 * adequate permissions, we must start all over.
3139 */
3140 if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
3141
3142 vm_map_verify_done(map, &version);
3143 if (real_map != map)
3144 vm_map_unlock(real_map);
3145
3146 if (m != VM_PAGE_NULL) {
3147 RELEASE_PAGE(m);
3148
3149 vm_fault_cleanup(m->object, top_page);
3150 } else
3151 vm_fault_cleanup(object, top_page);
3152
3153 vm_object_deallocate(object);
3154
3155 goto RetryFault;
3156 }
3157 if (m != VM_PAGE_NULL) {
3158 /*
3159 * Put this page into the physical map.
3160 * We had to do the unlock above because pmap_enter
3161 * may cause other faults. The page may be on
3162 * the pageout queues. If the pageout daemon comes
3163 * across the page, it will remove it from the queues.
3164 */
3165 if (caller_pmap) {
3166 kr = vm_fault_enter(m,
3167 caller_pmap,
3168 caller_pmap_addr,
3169 prot,
3170 wired,
3171 change_wiring,
3172 fault_info.no_cache,
3173 &type_of_fault);
3174 } else {
3175 kr = vm_fault_enter(m,
3176 pmap,
3177 vaddr,
3178 prot,
3179 wired,
3180 change_wiring,
3181 fault_info.no_cache,
3182 &type_of_fault);
3183 }
3184 if (kr != KERN_SUCCESS) {
3185 /* abort this page fault */
3186 vm_map_verify_done(map, &version);
3187 if (real_map != map)
3188 vm_map_unlock(real_map);
3189 PAGE_WAKEUP_DONE(m);
3190 vm_fault_cleanup(m->object, top_page);
3191 vm_object_deallocate(object);
3192 goto done;
3193 }
3194 } else {
3195
3196 vm_map_entry_t entry;
3197 vm_map_offset_t laddr;
3198 vm_map_offset_t ldelta, hdelta;
3199
3200 /*
3201 * do a pmap block mapping from the physical address
3202 * in the object
3203 */
3204
3205 #ifdef ppc
3206 /* While we do not worry about execution protection in */
3207 /* general, certain pages may have instruction execution */
3208 /* disallowed. We will check here, and if not allowed */
3209 /* to execute, we return with a protection failure. */
3210
3211 if ((fault_type & VM_PROT_EXECUTE) &&
3212 (!pmap_eligible_for_execute((ppnum_t)(object->shadow_offset >> 12)))) {
3213
3214 vm_map_verify_done(map, &version);
3215
3216 if (real_map != map)
3217 vm_map_unlock(real_map);
3218
3219 vm_fault_cleanup(object, top_page);
3220 vm_object_deallocate(object);
3221
3222 kr = KERN_PROTECTION_FAILURE;
3223 goto done;
3224 }
3225 #endif /* ppc */
3226
3227 if (real_map != map)
3228 vm_map_unlock(real_map);
3229
3230 if (original_map != map) {
3231 vm_map_unlock_read(map);
3232 vm_map_lock_read(original_map);
3233 map = original_map;
3234 }
3235 real_map = map;
3236
3237 laddr = vaddr;
3238 hdelta = 0xFFFFF000;
3239 ldelta = 0xFFFFF000;
3240
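/*
 * ldelta and hdelta start out "very large" and are trimmed by the walk
 * below to the distance from the faulting address to the start and end
 * of each map entry traversed, so the resulting block mapping spans
 * [vaddr - ldelta, vaddr + hdelta).  The ">> 12" shifts further down
 * convert byte offsets and lengths into 4K-page units for
 * pmap_map_block().
 */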
3241 while (vm_map_lookup_entry(map, laddr, &entry)) {
3242 if (ldelta > (laddr - entry->vme_start))
3243 ldelta = laddr - entry->vme_start;
3244 if (hdelta > (entry->vme_end - laddr))
3245 hdelta = entry->vme_end - laddr;
3246 if (entry->is_sub_map) {
3247
3248 laddr = (laddr - entry->vme_start)
3249 + entry->offset;
3250 vm_map_lock_read(entry->object.sub_map);
3251
3252 if (map != real_map)
3253 vm_map_unlock_read(map);
3254 if (entry->use_pmap) {
3255 vm_map_unlock_read(real_map);
3256 real_map = entry->object.sub_map;
3257 }
3258 map = entry->object.sub_map;
3259
3260 } else {
3261 break;
3262 }
3263 }
3264
3265 if (vm_map_lookup_entry(map, laddr, &entry) &&
3266 (entry->object.vm_object != NULL) &&
3267 (entry->object.vm_object == object)) {
3268
3269 if (caller_pmap) {
3270 /*
3271 * Set up a block mapped area
3272 */
3273 pmap_map_block(caller_pmap,
3274 (addr64_t)(caller_pmap_addr - ldelta),
3275 (((vm_map_offset_t) (entry->object.vm_object->shadow_offset)) +
3276 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12,
3277 ((ldelta + hdelta) >> 12), prot,
3278 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3279 } else {
3280 /*
3281 * Set up a block mapped area
3282 */
3283 pmap_map_block(real_map->pmap,
3284 (addr64_t)(vaddr - ldelta),
3285 (((vm_map_offset_t)(entry->object.vm_object->shadow_offset)) +
3286 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12,
3287 ((ldelta + hdelta) >> 12), prot,
3288 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3289 }
3290 }
3291 }
3292
3293 /*
3294 * Unlock everything, and return
3295 */
3296 vm_map_verify_done(map, &version);
3297 if (real_map != map)
3298 vm_map_unlock(real_map);
3299
3300 if (m != VM_PAGE_NULL) {
3301 PAGE_WAKEUP_DONE(m);
3302
3303 vm_fault_cleanup(m->object, top_page);
3304 } else
3305 vm_fault_cleanup(object, top_page);
3306
3307 vm_object_deallocate(object);
3308
3309 #undef RELEASE_PAGE
3310
3311 kr = KERN_SUCCESS;
3312 done:
3313 thread_interrupt_level(interruptible_state);
3314
3315 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3316 (int)((uint64_t)vaddr >> 32),
3317 (int)vaddr,
3318 kr,
3319 type_of_fault,
3320 0);
3321
3322 return (kr);
3323 }
3324
3325 /*
3326 * vm_fault_wire:
3327 *
3328 * Wire down a range of virtual addresses in a map.
3329 */
3330 kern_return_t
3331 vm_fault_wire(
3332 vm_map_t map,
3333 vm_map_entry_t entry,
3334 pmap_t pmap,
3335 vm_map_offset_t pmap_addr)
3336 {
3337
3338 register vm_map_offset_t va;
3339 register vm_map_offset_t end_addr = entry->vme_end;
3340 register kern_return_t rc;
3341
3342 assert(entry->in_transition);
3343
3344 if ((entry->object.vm_object != NULL) &&
3345 !entry->is_sub_map &&
3346 entry->object.vm_object->phys_contiguous) {
3347 return KERN_SUCCESS;
3348 }
3349
3350 /*
3351 * Inform the physical mapping system that the
3352 * range of addresses may not fault, so that
3353 * page tables and such can be locked down as well.
3354 */
3355
3356 pmap_pageable(pmap, pmap_addr,
3357 pmap_addr + (end_addr - entry->vme_start), FALSE);
3358
3359 /*
3360 * We simulate a fault to get the page and enter it
3361 * in the physical map.
3362 */
3363
3364 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3365 if ((rc = vm_fault_wire_fast(
3366 map, va, entry, pmap,
3367 pmap_addr + (va - entry->vme_start)
3368 )) != KERN_SUCCESS) {
3369 rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
3370 (pmap == kernel_pmap) ?
3371 THREAD_UNINT : THREAD_ABORTSAFE,
3372 pmap, pmap_addr + (va - entry->vme_start));
3373 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
3374 }
3375
3376 if (rc != KERN_SUCCESS) {
3377 struct vm_map_entry tmp_entry = *entry;
3378
3379 /* unwire wired pages */
3380 tmp_entry.vme_end = va;
3381 vm_fault_unwire(map,
3382 &tmp_entry, FALSE, pmap, pmap_addr);
3383
3384 return rc;
3385 }
3386 }
3387 return KERN_SUCCESS;
3388 }
3389
3390 /*
3391 * vm_fault_unwire:
3392 *
3393 * Unwire a range of virtual addresses in a map.
3394 */
3395 void
3396 vm_fault_unwire(
3397 vm_map_t map,
3398 vm_map_entry_t entry,
3399 boolean_t deallocate,
3400 pmap_t pmap,
3401 vm_map_offset_t pmap_addr)
3402 {
3403 register vm_map_offset_t va;
3404 register vm_map_offset_t end_addr = entry->vme_end;
3405 vm_object_t object;
3406 struct vm_object_fault_info fault_info;
3407
3408 object = (entry->is_sub_map)
3409 ? VM_OBJECT_NULL : entry->object.vm_object;
3410
3411 /*
3412 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
3413 * do anything since such memory is wired by default. So we don't have
3414 * anything to undo here.
3415 */
3416
3417 if (object != VM_OBJECT_NULL && object->phys_contiguous)
3418 return;
3419
3420 fault_info.interruptible = THREAD_UNINT;
3421 fault_info.behavior = entry->behavior;
3422 fault_info.user_tag = entry->alias;
3423 fault_info.lo_offset = entry->offset;
3424 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
3425 fault_info.no_cache = entry->no_cache;
3426
3427 /*
3428 * Since the pages are wired down, we must be able to
3429 * get their mappings from the physical map system.
3430 */
3431
3432 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3433
3434 if (pmap) {
3435 pmap_change_wiring(pmap,
3436 pmap_addr + (va - entry->vme_start), FALSE);
3437 }
3438 if (object == VM_OBJECT_NULL) {
3439 (void) vm_fault(map, va, VM_PROT_NONE,
3440 TRUE, THREAD_UNINT, pmap, pmap_addr);
3441 } else {
3442 vm_prot_t prot;
3443 vm_page_t result_page;
3444 vm_page_t top_page;
3445 vm_object_t result_object;
3446 vm_fault_return_t result;
3447
3448 fault_info.cluster_size = end_addr - va;
3449
3450 do {
3451 prot = VM_PROT_NONE;
3452
3453 vm_object_lock(object);
3454 vm_object_paging_begin(object);
3455 XPR(XPR_VM_FAULT,
3456 "vm_fault_unwire -> vm_fault_page\n",
3457 0,0,0,0,0);
3458 result = vm_fault_page(
3459 object,
3460 entry->offset + (va - entry->vme_start),
3461 VM_PROT_NONE, TRUE,
3462 &prot, &result_page, &top_page,
3463 (int *)0,
3464 NULL, map->no_zero_fill,
3465 FALSE, &fault_info);
3466 } while (result == VM_FAULT_RETRY);
3467
3468 /*
3469 * If this was a mapping to a file on a device that has been forcibly
3470 * unmounted, then we won't get a page back from vm_fault_page(). Just
3471 * move on to the next one in case the remaining pages are mapped from
3472 * different objects. During a forced unmount, the object is terminated
3473 * so the alive flag will be false if this happens. A forced unmount will
3474 * occur when an external disk is unplugged before the user does an
3475 * eject, so we don't want to panic in that situation.
3476 */
3477
3478 if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
3479 continue;
3480
3481 if (result != VM_FAULT_SUCCESS)
3482 panic("vm_fault_unwire: failure");
3483
3484 result_object = result_page->object;
3485
3486 if (deallocate) {
3487 assert(result_page->phys_page !=
3488 vm_page_fictitious_addr);
3489 pmap_disconnect(result_page->phys_page);
3490 VM_PAGE_FREE(result_page);
3491 } else {
3492 vm_page_lockspin_queues();
3493 vm_page_unwire(result_page);
3494 vm_page_unlock_queues();
3495 PAGE_WAKEUP_DONE(result_page);
3496 }
3497 vm_fault_cleanup(result_object, top_page);
3498 }
3499 }
3500
3501 /*
3502 * Inform the physical mapping system that the range
3503 * of addresses may fault, so that page tables and
3504 * such may be unwired themselves.
3505 */
3506
3507 pmap_pageable(pmap, pmap_addr,
3508 pmap_addr + (end_addr - entry->vme_start), TRUE);
3509
3510 }
3511
3512 /*
3513 * vm_fault_wire_fast:
3514 *
3515 * Handle common case of a wire down page fault at the given address.
3516 * If successful, the page is inserted into the associated physical map.
3517 * The map entry is passed in to avoid the overhead of a map lookup.
3518 *
3519 * NOTE: the given address should be truncated to the
3520 * proper page address.
3521 *
3522 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
3523 * a standard error specifying why the fault is fatal is returned.
3524 *
3525 * The map in question must be referenced, and remains so.
3526 * Caller has a read lock on the map.
3527 *
3528 * This is a stripped version of vm_fault() for wiring pages. Anything
3529 * other than the common case will return KERN_FAILURE, and the caller
3530 * is expected to call vm_fault().
3531 */
3532 kern_return_t
3533 vm_fault_wire_fast(
3534 __unused vm_map_t map,
3535 vm_map_offset_t va,
3536 vm_map_entry_t entry,
3537 pmap_t pmap,
3538 vm_map_offset_t pmap_addr)
3539 {
3540 vm_object_t object;
3541 vm_object_offset_t offset;
3542 register vm_page_t m;
3543 vm_prot_t prot;
3544 thread_t thread = current_thread();
3545 int type_of_fault;
3546 kern_return_t kr;
3547
3548 VM_STAT_INCR(faults);
3549
3550 if (thread != THREAD_NULL && thread->task != TASK_NULL)
3551 thread->task->faults++;
3552
3553 /*
3554 * Recovery actions
3555 */
3556
3557 #undef RELEASE_PAGE
3558 #define RELEASE_PAGE(m) { \
3559 PAGE_WAKEUP_DONE(m); \
3560 vm_page_lockspin_queues(); \
3561 vm_page_unwire(m); \
3562 vm_page_unlock_queues(); \
3563 }
3564
3565
3566 #undef UNLOCK_THINGS
3567 #define UNLOCK_THINGS { \
3568 vm_object_paging_end(object); \
3569 vm_object_unlock(object); \
3570 }
3571
3572 #undef UNLOCK_AND_DEALLOCATE
3573 #define UNLOCK_AND_DEALLOCATE { \
3574 UNLOCK_THINGS; \
3575 vm_object_deallocate(object); \
3576 }
3577 /*
3578 * Give up and have caller do things the hard way.
3579 */
3580
3581 #define GIVE_UP { \
3582 UNLOCK_AND_DEALLOCATE; \
3583 return(KERN_FAILURE); \
3584 }
3585
3586
3587 /*
3588 * If this entry is not directly to a vm_object, bail out.
3589 */
3590 if (entry->is_sub_map)
3591 return(KERN_FAILURE);
3592
3593 /*
3594 * Find the backing store object and offset into it.
3595 */
3596
3597 object = entry->object.vm_object;
3598 offset = (va - entry->vme_start) + entry->offset;
3599 prot = entry->protection;
3600
3601 /*
3602 * Make a reference to this object to prevent its
3603 * disposal while we are messing with it.
3604 */
3605
3606 vm_object_lock(object);
3607 vm_object_reference_locked(object);
3608 vm_object_paging_begin(object);
3609
3610 /*
3611 * INVARIANTS (through entire routine):
3612 *
3613 * 1) At all times, we must either have the object
3614 * lock or a busy page in some object to prevent
3615 * some other thread from trying to bring in
3616 * the same page.
3617 *
3618 * 2) Once we have a busy page, we must remove it from
3619 * the pageout queues, so that the pageout daemon
3620 * will not grab it away.
3621 *
3622 */
3623
3624 /*
3625 * Look for page in top-level object. If it's not there or
3626 * there's something going on, give up.
3627 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
3628 * decrypt the page before wiring it down.
3629 */
3630 m = vm_page_lookup(object, offset);
3631 if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
3632 (m->unusual && ( m->error || m->restart || m->absent))) {
3633
3634 GIVE_UP;
3635 }
3636 ASSERT_PAGE_DECRYPTED(m);
3637
3638 if (m->fictitious &&
3639 m->phys_page == vm_page_guard_addr) {
3640 /*
3641 * Guard pages are fictitious pages and are never
3642 * entered into a pmap, so let's say it's been wired...
3643 */
3644 kr = KERN_SUCCESS;
3645 goto done;
3646 }
3647
3648 /*
3649 * Wire the page down now. All bail outs beyond this
3650 * point must unwire the page.
3651 */
3652
3653 vm_page_lockspin_queues();
3654 vm_page_wire(m);
3655 vm_page_unlock_queues();
3656
3657 /*
3658 * Mark page busy for other threads.
3659 */
3660 assert(!m->busy);
3661 m->busy = TRUE;
3662 assert(!m->absent);
3663
3664 /*
3665 * Give up if the page is being written and there's a copy object
3666 */
3667 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
3668 RELEASE_PAGE(m);
3669 GIVE_UP;
3670 }
3671
3672 /*
3673 * Put this page into the physical map.
3674 */
3675 type_of_fault = DBG_CACHE_HIT_FAULT;
3676 kr = vm_fault_enter(m,
3677 pmap,
3678 pmap_addr,
3679 prot,
3680 TRUE,
3681 FALSE,
3682 FALSE,
3683 &type_of_fault);
3684
3685 done:
3686 /*
3687 * Unlock everything, and return
3688 */
3689
3690 PAGE_WAKEUP_DONE(m);
3691 UNLOCK_AND_DEALLOCATE;
3692
3693 return kr;
3694
3695 }
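
/*
 * For reference, callers are expected to pair the fast routine above with
 * the general vm_fault() path roughly as follows; this mirrors the loop in
 * vm_fault_wire() and is a compiled-out sketch, not additional behavior.
 */
#if 0
static void
wire_one_page(vm_map_t map, vm_map_offset_t va, vm_map_entry_t entry,
	      pmap_t pmap, vm_map_offset_t pmap_addr)
{
	if (vm_fault_wire_fast(map, va, entry, pmap, pmap_addr) != KERN_SUCCESS)
		(void) vm_fault(map, va, VM_PROT_NONE, TRUE,
				(pmap == kernel_pmap) ? THREAD_UNINT : THREAD_ABORTSAFE,
				pmap, pmap_addr);
}
#endif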
3696
3697 /*
3698 * Routine: vm_fault_copy_cleanup
3699 * Purpose:
3700 * Release a page used by vm_fault_copy.
3701 */
3702
3703 void
3704 vm_fault_copy_cleanup(
3705 vm_page_t page,
3706 vm_page_t top_page)
3707 {
3708 vm_object_t object = page->object;
3709
3710 vm_object_lock(object);
3711 PAGE_WAKEUP_DONE(page);
3712 vm_page_lockspin_queues();
3713 if (!page->active && !page->inactive && !page->throttled)
3714 vm_page_activate(page);
3715 vm_page_unlock_queues();
3716 vm_fault_cleanup(object, top_page);
3717 }
3718
3719 void
3720 vm_fault_copy_dst_cleanup(
3721 vm_page_t page)
3722 {
3723 vm_object_t object;
3724
3725 if (page != VM_PAGE_NULL) {
3726 object = page->object;
3727 vm_object_lock(object);
3728 vm_page_lockspin_queues();
3729 vm_page_unwire(page);
3730 vm_page_unlock_queues();
3731 vm_object_paging_end(object);
3732 vm_object_unlock(object);
3733 }
3734 }
3735
3736 /*
3737 * Routine: vm_fault_copy
3738 *
3739 * Purpose:
3740 * Copy pages from one virtual memory object to another --
3741 * neither the source nor destination pages need be resident.
3742 *
3743 * Before actually copying a page, the version associated with
3744 * the destination address map will be verified.
3745 *
3746 * In/out conditions:
3747 * The caller must hold a reference, but not a lock, to
3748 * each of the source and destination objects and to the
3749 * destination map.
3750 *
3751 * Results:
3752 * Returns KERN_SUCCESS if no errors were encountered in
3753 * reading or writing the data. Returns KERN_INTERRUPTED if
3754 * the operation was interrupted (only possible if the
3755 * "interruptible" argument is asserted). Other return values
3756 * indicate a permanent error in copying the data.
3757 *
3758 * The actual amount of data copied will be returned in the
3759 * "copy_size" argument. In the event that the destination map
3760 * verification failed, this amount may be less than the amount
3761 * requested.
3762 */
3763 kern_return_t
3764 vm_fault_copy(
3765 vm_object_t src_object,
3766 vm_object_offset_t src_offset,
3767 vm_map_size_t *copy_size, /* INOUT */
3768 vm_object_t dst_object,
3769 vm_object_offset_t dst_offset,
3770 vm_map_t dst_map,
3771 vm_map_version_t *dst_version,
3772 int interruptible)
3773 {
3774 vm_page_t result_page;
3775
3776 vm_page_t src_page;
3777 vm_page_t src_top_page;
3778 vm_prot_t src_prot;
3779
3780 vm_page_t dst_page;
3781 vm_page_t dst_top_page;
3782 vm_prot_t dst_prot;
3783
3784 vm_map_size_t amount_left;
3785 vm_object_t old_copy_object;
3786 kern_return_t error = 0;
3787
3788 vm_map_size_t part_size;
3789 struct vm_object_fault_info fault_info_src;
3790 struct vm_object_fault_info fault_info_dst;
3791
3792 /*
3793 * In order not to confuse the clustered pageins, align
3794 * the different offsets on a page boundary.
3795 */
3796
3797 #define RETURN(x) \
3798 MACRO_BEGIN \
3799 *copy_size -= amount_left; \
3800 MACRO_RETURN(x); \
3801 MACRO_END
3802
3803 amount_left = *copy_size;
3804
3805 fault_info_src.interruptible = interruptible;
3806 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
3807 fault_info_src.user_tag = 0;
3808 fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
3809 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
3810 fault_info_src.no_cache = FALSE;
3811
3812 fault_info_dst.interruptible = interruptible;
3813 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
3814 fault_info_dst.user_tag = 0;
3815 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
3816 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
3817 fault_info_dst.no_cache = FALSE;
3818
3819 do { /* while (amount_left > 0) */
3820 /*
3821 * There may be a deadlock if both source and destination
3822 * pages are the same. To avoid this deadlock, the copy must
3823 * start by getting the destination page in order to apply
3824 * COW semantics if any.
3825 */
3826
3827 RetryDestinationFault: ;
3828
3829 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
3830
3831 vm_object_lock(dst_object);
3832 vm_object_paging_begin(dst_object);
3833
3834 fault_info_dst.cluster_size = amount_left;
3835
3836 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
3837 switch (vm_fault_page(dst_object,
3838 vm_object_trunc_page(dst_offset),
3839 VM_PROT_WRITE|VM_PROT_READ,
3840 FALSE,
3841 &dst_prot, &dst_page, &dst_top_page,
3842 (int *)0,
3843 &error,
3844 dst_map->no_zero_fill,
3845 FALSE, &fault_info_dst)) {
3846 case VM_FAULT_SUCCESS:
3847 break;
3848 case VM_FAULT_RETRY:
3849 goto RetryDestinationFault;
3850 case VM_FAULT_MEMORY_SHORTAGE:
3851 if (vm_page_wait(interruptible))
3852 goto RetryDestinationFault;
3853 /* fall thru */
3854 case VM_FAULT_INTERRUPTED:
3855 RETURN(MACH_SEND_INTERRUPTED);
3856 case VM_FAULT_MEMORY_ERROR:
3857 if (error)
3858 return (error);
3859 else
3860 return(KERN_MEMORY_ERROR);
3861 }
3862 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
3863
3864 old_copy_object = dst_page->object->copy;
3865
3866 /*
3867 * There exists the possibility that the source and
3868 * destination page are the same. But we can't
3869 * easily determine that now. If they are the
3870 * same, the call to vm_fault_page() for the
3871 * destination page will deadlock. To prevent this we
3872 * wire the page so we can drop busy without having
3873 * the page daemon steal the page. We clean up the
3874 * top page but keep the paging reference on the object
3875 * holding the dest page so it doesn't go away.
3876 */
3877
3878 vm_page_lockspin_queues();
3879 vm_page_wire(dst_page);
3880 vm_page_unlock_queues();
3881 PAGE_WAKEUP_DONE(dst_page);
3882 vm_object_unlock(dst_page->object);
3883
3884 if (dst_top_page != VM_PAGE_NULL) {
3885 vm_object_lock(dst_object);
3886 VM_PAGE_FREE(dst_top_page);
3887 vm_object_paging_end(dst_object);
3888 vm_object_unlock(dst_object);
3889 }
3890
3891 RetrySourceFault: ;
3892
3893 if (src_object == VM_OBJECT_NULL) {
3894 /*
3895 * No source object. We will just
3896 * zero-fill the page in dst_object.
3897 */
3898 src_page = VM_PAGE_NULL;
3899 result_page = VM_PAGE_NULL;
3900 } else {
3901 vm_object_lock(src_object);
3902 src_page = vm_page_lookup(src_object,
3903 vm_object_trunc_page(src_offset));
3904 if (src_page == dst_page) {
3905 src_prot = dst_prot;
3906 result_page = VM_PAGE_NULL;
3907 } else {
3908 src_prot = VM_PROT_READ;
3909 vm_object_paging_begin(src_object);
3910
3911 fault_info_src.cluster_size = amount_left;
3912
3913 XPR(XPR_VM_FAULT,
3914 "vm_fault_copy(2) -> vm_fault_page\n",
3915 0,0,0,0,0);
3916 switch (vm_fault_page(
3917 src_object,
3918 vm_object_trunc_page(src_offset),
3919 VM_PROT_READ, FALSE,
3920 &src_prot,
3921 &result_page, &src_top_page,
3922 (int *)0, &error, FALSE,
3923 FALSE, &fault_info_src)) {
3924
3925 case VM_FAULT_SUCCESS:
3926 break;
3927 case VM_FAULT_RETRY:
3928 goto RetrySourceFault;
3929 case VM_FAULT_MEMORY_SHORTAGE:
3930 if (vm_page_wait(interruptible))
3931 goto RetrySourceFault;
3932 /* fall thru */
3933 case VM_FAULT_INTERRUPTED:
3934 vm_fault_copy_dst_cleanup(dst_page);
3935 RETURN(MACH_SEND_INTERRUPTED);
3936 case VM_FAULT_MEMORY_ERROR:
3937 vm_fault_copy_dst_cleanup(dst_page);
3938 if (error)
3939 return (error);
3940 else
3941 return(KERN_MEMORY_ERROR);
3942 }
3943
3944
3945 assert((src_top_page == VM_PAGE_NULL) ==
3946 (result_page->object == src_object));
3947 }
3948 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
3949 vm_object_unlock(result_page->object);
3950 }
3951
3952 if (!vm_map_verify(dst_map, dst_version)) {
3953 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3954 vm_fault_copy_cleanup(result_page, src_top_page);
3955 vm_fault_copy_dst_cleanup(dst_page);
3956 break;
3957 }
3958
3959 vm_object_lock(dst_page->object);
3960
3961 if (dst_page->object->copy != old_copy_object) {
3962 vm_object_unlock(dst_page->object);
3963 vm_map_verify_done(dst_map, dst_version);
3964 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3965 vm_fault_copy_cleanup(result_page, src_top_page);
3966 vm_fault_copy_dst_cleanup(dst_page);
3967 break;
3968 }
3969 vm_object_unlock(dst_page->object);
3970
3971 /*
3972 * Copy the page, and note that it is dirty
3973 * immediately.
3974 */
3975
3976 if (!page_aligned(src_offset) ||
3977 !page_aligned(dst_offset) ||
3978 !page_aligned(amount_left)) {
3979
3980 vm_object_offset_t src_po,
3981 dst_po;
3982
3983 src_po = src_offset - vm_object_trunc_page(src_offset);
3984 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
3985
3986 if (dst_po > src_po) {
3987 part_size = PAGE_SIZE - dst_po;
3988 } else {
3989 part_size = PAGE_SIZE - src_po;
3990 }
3991 if (part_size > (amount_left)) {
3992 part_size = amount_left;
3993 }
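
/*
 * Worked example (illustrative numbers, 4K pages): if src_offset ends in
 * 0x200 and dst_offset ends in 0x600, then src_po == 0x200 and
 * dst_po == 0x600, so part_size == PAGE_SIZE - 0x600 == 0xA00 bytes are
 * moved on this pass, keeping both the source and destination transfers
 * within a single page.
 */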
3994
3995 if (result_page == VM_PAGE_NULL) {
3996 vm_page_part_zero_fill(dst_page,
3997 dst_po, part_size);
3998 } else {
3999 vm_page_part_copy(result_page, src_po,
4000 dst_page, dst_po, part_size);
4001 if (!dst_page->dirty) {
4002 vm_object_lock(dst_object);
4003 dst_page->dirty = TRUE;
4004 vm_object_unlock(dst_page->object);
4005 }
4006
4007 }
4008 } else {
4009 part_size = PAGE_SIZE;
4010
4011 if (result_page == VM_PAGE_NULL)
4012 vm_page_zero_fill(dst_page);
4013 else {
4014 vm_page_copy(result_page, dst_page);
4015 if (!dst_page->dirty) {
4016 vm_object_lock(dst_object);
4017 dst_page->dirty = TRUE;
4018 vm_object_unlock(dst_page->object);
4019 }
4020 }
4021
4022 }
4023
4024 /*
4025 * Unlock everything and move on to the next chunk, if any.
4026 */
4027
4028 vm_map_verify_done(dst_map, dst_version);
4029
4030 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4031 vm_fault_copy_cleanup(result_page, src_top_page);
4032 vm_fault_copy_dst_cleanup(dst_page);
4033
4034 amount_left -= part_size;
4035 src_offset += part_size;
4036 dst_offset += part_size;
4037 } while (amount_left > 0);
4038
4039 RETURN(KERN_SUCCESS);
4040 #undef RETURN
4041
4042 /*NOTREACHED*/
4043 }
4044
4045 #if VM_FAULT_CLASSIFY
4046 /*
4047 * Temporary statistics gathering support.
4048 */
4049
4050 /*
4051 * Statistics arrays:
4052 */
4053 #define VM_FAULT_TYPES_MAX 5
4054 #define VM_FAULT_LEVEL_MAX 8
4055
4056 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4057
4058 #define VM_FAULT_TYPE_ZERO_FILL 0
4059 #define VM_FAULT_TYPE_MAP_IN 1
4060 #define VM_FAULT_TYPE_PAGER 2
4061 #define VM_FAULT_TYPE_COPY 3
4062 #define VM_FAULT_TYPE_OTHER 4
4063
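/*
 * vm_fault_stats[type][level] counts faults of a given type by the
 * number of shadow-chain hops it took to resolve them; for example,
 * vm_fault_stats[VM_FAULT_TYPE_PAGER][2] would count faults satisfied
 * by a pager two levels down the shadow chain.
 */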
4064
4065 void
4066 vm_fault_classify(vm_object_t object,
4067 vm_object_offset_t offset,
4068 vm_prot_t fault_type)
4069 {
4070 int type, level = 0;
4071 vm_page_t m;
4072
4073 while (TRUE) {
4074 m = vm_page_lookup(object, offset);
4075 if (m != VM_PAGE_NULL) {
4076 if (m->busy || m->error || m->restart || m->absent) {
4077 type = VM_FAULT_TYPE_OTHER;
4078 break;
4079 }
4080 if (((fault_type & VM_PROT_WRITE) == 0) ||
4081 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4082 type = VM_FAULT_TYPE_MAP_IN;
4083 break;
4084 }
4085 type = VM_FAULT_TYPE_COPY;
4086 break;
4087 }
4088 else {
4089 if (object->pager_created) {
4090 type = VM_FAULT_TYPE_PAGER;
4091 break;
4092 }
4093 if (object->shadow == VM_OBJECT_NULL) {
4094 type = VM_FAULT_TYPE_ZERO_FILL;
4095 break;
4096 }
4097
4098 offset += object->shadow_offset;
4099 object = object->shadow;
4100 level++;
4101 continue;
4102 }
4103 }
4104
4105 if (level >= VM_FAULT_LEVEL_MAX)
4106 level = VM_FAULT_LEVEL_MAX - 1;
4107
4108 vm_fault_stats[type][level] += 1;
4109
4110 return;
4111 }
4112
4113 /* cleanup routine to call from debugger */
4114
4115 void
4116 vm_fault_classify_init(void)
4117 {
4118 int type, level;
4119
4120 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
4121 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
4122 vm_fault_stats[type][level] = 0;
4123 }
4124 }
4125
4126 return;
4127 }
4128 #endif /* VM_FAULT_CLASSIFY */
4129
4130
4131 extern int cs_validation;
4132
4133 void
4134 vm_page_validate_cs(
4135 vm_page_t page)
4136 {
4137 vm_object_t object;
4138 vm_object_offset_t offset;
4139 vm_map_offset_t koffset;
4140 vm_map_size_t ksize;
4141 vm_offset_t kaddr;
4142 kern_return_t kr;
4143 memory_object_t pager;
4144 void *blobs;
4145 boolean_t validated, tainted;
4146 boolean_t busy_page;
4147
4148 vm_object_lock_assert_held(page->object);
4149
4150 if (!cs_validation) {
4151 return;
4152 }
4153
4154 if (page->cs_validated && !page->cs_tainted && page->wpmapped) {
4155 vm_object_lock_assert_exclusive(page->object);
4156
4157 /*
4158 * This page has already been validated and found to
4159 * be valid. However, it was mapped for "write" access
4160 * sometime in the past, so we have to check if it was
4161 * modified. If so, it needs to be revalidated.
4162 * If the page was already found to be "tainted", no
4163 * need to re-validate.
4164 */
4165 if (!page->dirty) {
4166 vm_cs_query_modified++;
4167 page->dirty = pmap_is_modified(page->phys_page);
4168 }
4169 if (page->dirty) {
4170 /*
4171 * The page is dirty, so let's clear its
4172 * "validated" bit and re-validate it.
4173 */
4174 if (cs_debug) {
4175 printf("CODESIGNING: vm_page_validate_cs: "
4176 "page %p obj %p off 0x%llx "
4177 "was modified\n",
4178 page, page->object, page->offset);
4179 }
4180 page->cs_validated = FALSE;
4181 vm_cs_validated_dirtied++;
4182 }
4183 }
4184
4185 if (page->cs_validated) {
4186 return;
4187 }
4188
4189 vm_object_lock_assert_exclusive(page->object);
4190
4191 vm_cs_validates++;
4192
4193 object = page->object;
4194 assert(object->code_signed);
4195 offset = page->offset;
4196
4197 busy_page = page->busy;
4198 if (!busy_page) {
4199 /* keep page busy while we map (and unlock) the VM object */
4200 page->busy = TRUE;
4201 }
4202
4203 /*
4204 * Take a paging reference on the VM object
4205 * to protect it from collapse or bypass,
4206 * and keep it from disappearing too.
4207 */
4208 vm_object_paging_begin(object);
4209
4210 /* map the page in the kernel address space */
4211 koffset = 0;
4212 ksize = PAGE_SIZE_64;
4213 kr = vm_paging_map_object(&koffset,
4214 page,
4215 object,
4216 offset,
4217 &ksize,
4218 FALSE); /* can't unlock object! */
4219 if (kr != KERN_SUCCESS) {
4220 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
4221 }
4222 kaddr = CAST_DOWN(vm_offset_t, koffset);
4223
4224 /*
4225 * Since we get here to validate a page that was brought in by
4226 * the pager, we know that this pager is all set up and ready
4227 * by now.
4228 */
4229 assert(!object->internal);
4230 assert(object->pager != NULL);
4231 assert(object->pager_ready);
4232
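/*
 * If we bail out below, the page is simply left with its
 * cs_validated bit clear; vm_page_validate_cs() reports no
 * error to its caller.
 */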
4233 if (!object->alive || object->terminating || object->pager == NULL) {
4234 /*
4235 * The object is terminating and we don't have its pager
4236 * so we can't validate the data...
4237 */
4238 goto out;
4239 }
4240
4241 pager = object->pager;
4242 assert(pager != NULL);
4243
4244 kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
4245 if (kr != KERN_SUCCESS) {
4246 blobs = NULL;
4247 }
4248
4249 /* verify the SHA1 hash for this page */
4250 validated = cs_validate_page(blobs,
4251 offset + object->paging_offset,
4252 (const void *)kaddr,
4253 &tainted);
4254
4255 assert(page->busy);
4256 assert(object == page->object);
4257 vm_object_lock_assert_exclusive(object);
4258
4259 page->cs_validated = validated;
4260 if (validated) {
4261 page->cs_tainted = tainted;
4262 }
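/*
 * Note that cs_tainted is only updated when the page validated
 * successfully; for a page that failed validation it is left
 * unchanged.
 */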
4263
4264 out:
4265 if (!busy_page) {
4266 PAGE_WAKEUP_DONE(page);
4267 }
4268 if (koffset != 0) {
4269 /* unmap the map from the kernel address space */
4270 vm_paging_unmap_object(object, koffset, koffset + ksize);
4271 koffset = 0;
4272 ksize = 0;
4273 kaddr = 0;
4274 }
4275 vm_object_paging_end(object);
4276 }