1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm_fault.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Page fault handling module.
63 */
64
65 #include <mach_cluster_stats.h>
66 #include <mach_pagemap.h>
67 #include <mach_kdb.h>
68 #include <libkern/OSAtomic.h>
69
70 #include <mach/mach_types.h>
71 #include <mach/kern_return.h>
72 #include <mach/message.h> /* for error codes */
73 #include <mach/vm_param.h>
74 #include <mach/vm_behavior.h>
75 #include <mach/memory_object.h>
76 /* For memory_object_data_{request,unlock} */
77 #include <mach/sdt.h>
78
79 #include <kern/kern_types.h>
80 #include <kern/host_statistics.h>
81 #include <kern/counters.h>
82 #include <kern/task.h>
83 #include <kern/thread.h>
84 #include <kern/sched_prim.h>
85 #include <kern/host.h>
86 #include <kern/xpr.h>
87 #include <kern/mach_param.h>
88 #include <kern/macro_help.h>
89 #include <kern/zalloc.h>
90 #include <kern/misc_protos.h>
91
92 #include <ppc/proc_reg.h>
93
94 #include <vm/vm_fault.h>
95 #include <vm/vm_map.h>
96 #include <vm/vm_object.h>
97 #include <vm/vm_page.h>
98 #include <vm/vm_kern.h>
99 #include <vm/pmap.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/vm_protos.h>
102 #include <vm/vm_external.h>
103 #include <vm/memory_object.h>
104 #include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */
105
106 #include <sys/kdebug.h>
107
108 #define VM_FAULT_CLASSIFY 0
109
110 /* Zero-filled pages are marked "m->zero_fill" and put on the
111 * special zero-fill inactive queue only if they belong to
112 * an object at least this big.
113 */
114 #define VM_ZF_OBJECT_SIZE_THRESHOLD (0x200000)
115
116 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
117
118 int vm_object_pagein_throttle = 16;
119
120 extern int cs_debug;
121
122 #if MACH_KDB
123 extern struct db_watchpoint *db_watchpoint_list;
124 #endif /* MACH_KDB */
125
126
127 /* Forward declarations of internal routines. */
128 extern kern_return_t vm_fault_wire_fast(
129 vm_map_t map,
130 vm_map_offset_t va,
131 vm_map_entry_t entry,
132 pmap_t pmap,
133 vm_map_offset_t pmap_addr);
134
135 extern void vm_fault_continue(void);
136
137 extern void vm_fault_copy_cleanup(
138 vm_page_t page,
139 vm_page_t top_page);
140
141 extern void vm_fault_copy_dst_cleanup(
142 vm_page_t page);
143
144 #if VM_FAULT_CLASSIFY
145 extern void vm_fault_classify(vm_object_t object,
146 vm_object_offset_t offset,
147 vm_prot_t fault_type);
148
149 extern void vm_fault_classify_init(void);
150 #endif
151
152 /*
153 * Routine: vm_fault_init
154 * Purpose:
155 * Initialize our private data structures.
156 */
157 void
158 vm_fault_init(void)
159 {
160 }
161
162 /*
163 * Routine: vm_fault_cleanup
164 * Purpose:
165 * Clean up the result of vm_fault_page.
166 * Results:
167 * The paging reference for "object" is released.
168 * "object" is unlocked.
169 * If "top_page" is not null, "top_page" is
170 * freed and the paging reference for the object
171 * containing it is released.
172 *
173 * In/out conditions:
174 * "object" must be locked.
175 */
176 void
177 vm_fault_cleanup(
178 register vm_object_t object,
179 register vm_page_t top_page)
180 {
181 vm_object_paging_end(object);
182 vm_object_unlock(object);
183
184 if (top_page != VM_PAGE_NULL) {
185 object = top_page->object;
186
187 vm_object_lock(object);
188 VM_PAGE_FREE(top_page);
189 vm_object_paging_end(object);
190 vm_object_unlock(object);
191 }
192 }
193
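/*
 * Optional cluster-fault statistics (MACH_CLUSTER_STATS): each entry counts
 * the pages brought in for a pagein cluster and how many of them sat at
 * offsets above or below the faulting offset.
 */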
194 #if MACH_CLUSTER_STATS
195 #define MAXCLUSTERPAGES 16
196 struct {
197 unsigned long pages_in_cluster;
198 unsigned long pages_at_higher_offsets;
199 unsigned long pages_at_lower_offsets;
200 } cluster_stats_in[MAXCLUSTERPAGES];
201 #define CLUSTER_STAT(clause) clause
202 #define CLUSTER_STAT_HIGHER(x) \
203 ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
204 #define CLUSTER_STAT_LOWER(x) \
205 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
206 #define CLUSTER_STAT_CLUSTER(x) \
207 ((cluster_stats_in[(x)].pages_in_cluster)++)
208 #else /* MACH_CLUSTER_STATS */
209 #define CLUSTER_STAT(clause)
210 #endif /* MACH_CLUSTER_STATS */
211
212 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
213
214
215 boolean_t vm_page_deactivate_behind = TRUE;
216 /*
217 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
218 */
219 int vm_default_ahead = 0;
220 int vm_default_behind = MAX_UPL_TRANSFER;
221
222 #define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024)
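/*
 * object->sequential accumulates a signed byte count for the current run of
 * sequential faults: it grows positive for forward runs, negative for reverse
 * runs, and is clamped to +/- MAX_SEQUENTIAL_RUN by vm_fault_is_sequential()
 * below.
 */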
223
224 /*
225  * vm_fault_is_sequential
226 *
227 * Determine if sequential access is in progress
228 * in accordance with the behavior specified.
229 * Update state to indicate current access pattern.
230 *
231 * object must have at least the shared lock held
232 */
233 static
234 void
235 vm_fault_is_sequential(
236 vm_object_t object,
237 vm_object_offset_t offset,
238 vm_behavior_t behavior)
239 {
240 vm_object_offset_t last_alloc;
241 int sequential;
242 int orig_sequential;
243
244 last_alloc = object->last_alloc;
245 sequential = object->sequential;
246 orig_sequential = sequential;
247
248 switch (behavior) {
249 case VM_BEHAVIOR_RANDOM:
250 /*
251 * reset indicator of sequential behavior
252 */
253 sequential = 0;
254 break;
255
256 case VM_BEHAVIOR_SEQUENTIAL:
257 if (offset && last_alloc == offset - PAGE_SIZE_64) {
258 /*
259 * advance indicator of sequential behavior
260 */
261 if (sequential < MAX_SEQUENTIAL_RUN)
262 sequential += PAGE_SIZE;
263 } else {
264 /*
265 * reset indicator of sequential behavior
266 */
267 sequential = 0;
268 }
269 break;
270
271 case VM_BEHAVIOR_RSEQNTL:
272 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
273 /*
274 * advance indicator of sequential behavior
275 */
276 if (sequential > -MAX_SEQUENTIAL_RUN)
277 sequential -= PAGE_SIZE;
278 } else {
279 /*
280 * reset indicator of sequential behavior
281 */
282 sequential = 0;
283 }
284 break;
285
286 case VM_BEHAVIOR_DEFAULT:
287 default:
288 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
289 /*
290 * advance indicator of sequential behavior
291 */
292 if (sequential < 0)
293 sequential = 0;
294 if (sequential < MAX_SEQUENTIAL_RUN)
295 sequential += PAGE_SIZE;
296
297 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
298 /*
299 * advance indicator of sequential behavior
300 */
301 if (sequential > 0)
302 sequential = 0;
303 if (sequential > -MAX_SEQUENTIAL_RUN)
304 sequential -= PAGE_SIZE;
305 } else {
306 /*
307 * reset indicator of sequential behavior
308 */
309 sequential = 0;
310 }
311 break;
312 }
313 if (sequential != orig_sequential) {
314 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
315 /*
316 * if someone else has already updated object->sequential
317 * don't bother trying to update it or object->last_alloc
318 */
319 return;
320 }
321 }
322 /*
323  * I'd like to do this with an OSCompareAndSwap64, but that
324 * doesn't exist for PPC... however, it shouldn't matter
325 * that much... last_alloc is maintained so that we can determine
326 * if a sequential access pattern is taking place... if only
327 * one thread is banging on this object, no problem with the unprotected
328 * update... if 2 or more threads are banging away, we run the risk of
329 * someone seeing a mangled update... however, in the face of multiple
330 * accesses, no sequential access pattern can develop anyway, so we
331 * haven't lost any real info.
332 */
333 object->last_alloc = offset;
334 }
335
336
337 /*
338  * vm_fault_deactivate_behind
339 *
340 * Determine if sequential access is in progress
341 * in accordance with the behavior specified. If
342 * so, compute a potential page to deactivate and
343 * deactivate it.
344 *
345 * object must be locked.
346 *
347 * return TRUE if we actually deactivate a page
348 */
349 static
350 boolean_t
351 vm_fault_deactivate_behind(
352 vm_object_t object,
353 vm_object_offset_t offset,
354 vm_behavior_t behavior)
355 {
356 vm_page_t m = NULL;
357 int sequential_run;
358 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
359
360 #if TRACEFAULTPAGE
361 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
362 #endif
363
364 if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
365 /*
366  * Do not deactivate pages from the kernel object: they
367  * are not intended to become pageable... or we've
368  * disabled the deactivate-behind mechanism.
369 */
370 return FALSE;
371 }
372 if ((sequential_run = object->sequential)) {
373 if (sequential_run < 0) {
374 sequential_behavior = VM_BEHAVIOR_RSEQNTL;
375 sequential_run = 0 - sequential_run;
376 } else {
377 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
378 }
379 }
380 switch (behavior) {
381 case VM_BEHAVIOR_RANDOM:
382 break;
383 case VM_BEHAVIOR_SEQUENTIAL:
384 if (sequential_run >= (int)PAGE_SIZE)
385 m = vm_page_lookup(object, offset - PAGE_SIZE_64);
386 break;
387 case VM_BEHAVIOR_RSEQNTL:
388 if (sequential_run >= (int)PAGE_SIZE)
389 m = vm_page_lookup(object, offset + PAGE_SIZE_64);
390 break;
391 case VM_BEHAVIOR_DEFAULT:
392 default:
393 { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
394
395 /*
396  * determine if the run of sequential access has been
397 * long enough on an object with default access behavior
398 * to consider it for deactivation
399 */
400 if ((uint64_t)sequential_run >= behind) {
401 if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
402 if (offset >= behind)
403 m = vm_page_lookup(object, offset - behind);
404 } else {
405 if (offset < -behind)
406 m = vm_page_lookup(object, offset + behind);
407 }
408 }
409 break;
410 }
411 }
412 if (m) {
413 if (!m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
414 pmap_clear_reference(m->phys_page);
415 m->deactivated = TRUE;
416 #if TRACEFAULTPAGE
417 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
418 #endif
419 return TRUE;
420 }
421 }
422 return FALSE;
423 }
424
425
426 /*
427 * check for various conditions that would
428 * prevent us from creating a ZF page...
429 * cleanup is based on being called from vm_fault_page
430 *
431 * object must be locked
432 * object == m->object
433 */
434 static vm_fault_return_t
435 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
436 {
437 if (object->shadow_severed) {
438 /*
439 * the shadow chain was severed
440 * just have to return an error at this point
441 */
442 if (m != VM_PAGE_NULL)
443 VM_PAGE_FREE(m);
444 vm_fault_cleanup(object, first_m);
445
446 thread_interrupt_level(interruptible_state);
447
448 return (VM_FAULT_MEMORY_ERROR);
449 }
450 if (vm_backing_store_low) {
451 /*
452  * Are we protecting the system from
453  * backing store exhaustion?  If so,
454  * sleep unless we are privileged.
455 */
456 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
457
458 if (m != VM_PAGE_NULL)
459 VM_PAGE_FREE(m);
460 vm_fault_cleanup(object, first_m);
461
462 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
463
464 thread_block(THREAD_CONTINUE_NULL);
465 thread_interrupt_level(interruptible_state);
466
467 return (VM_FAULT_RETRY);
468 }
469 }
470 if (VM_PAGE_ZFILL_THROTTLED()) {
471 /*
472 * we're throttling zero-fills...
473 * treat this as if we couldn't grab a page
474 */
475 if (m != VM_PAGE_NULL)
476 VM_PAGE_FREE(m);
477 vm_fault_cleanup(object, first_m);
478
479 thread_interrupt_level(interruptible_state);
480
481 return (VM_FAULT_MEMORY_SHORTAGE);
482 }
483 return (VM_FAULT_SUCCESS);
484 }
485
486
487 /*
488 * do the work to zero fill a page and
489 * inject it into the correct paging queue
490 *
491 * m->object must be locked
492 * page queue lock must NOT be held
493 */
494 static int
495 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
496 {
497 int my_fault = DBG_ZERO_FILL_FAULT;
498
499 /*
500  * This is a zero-fill page fault...
501 *
502 * Checking the page lock is a waste of
503 * time; this page was absent, so
504 * it can't be page locked by a pager.
505 *
506 * we also consider it undefined
507 * with respect to instruction
508 * execution. i.e. it is the responsibility
509 * of higher layers to call for an instruction
510 * sync after changing the contents and before
511 * sending a program into this area. We
512 * choose this approach for performance
513 */
514 m->pmapped = TRUE;
515
516 m->cs_validated = FALSE;
517 m->cs_tainted = FALSE;
518
519 if (no_zero_fill == TRUE)
520 my_fault = DBG_NZF_PAGE_FAULT;
521 else {
522 vm_page_zero_fill(m);
523
524 VM_STAT_INCR(zero_fill_count);
525 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
526 }
527 assert(!m->laundry);
528 assert(m->object != kernel_object);
529 //assert(m->pageq.next == NULL && m->pageq.prev == NULL);
530
531 if (!IP_VALID(memory_manager_default) &&
532 (m->object->purgable == VM_PURGABLE_DENY ||
533 m->object->purgable == VM_PURGABLE_NONVOLATILE)) {
534 vm_page_lock_queues();
535
536 queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
537 m->throttled = TRUE;
538 vm_page_throttled_count++;
539
540 vm_page_unlock_queues();
541 } else {
542 if (m->object->size > VM_ZF_OBJECT_SIZE_THRESHOLD) {
543 m->zero_fill = TRUE;
544 OSAddAtomic(1, (SInt32 *)&vm_zf_count);
545 }
546 }
547 return (my_fault);
548 }
549
550
551 /*
552 * Routine: vm_fault_page
553 * Purpose:
554 * Find the resident page for the virtual memory
555 * specified by the given virtual memory object
556 * and offset.
557 * Additional arguments:
558  * The required permissions for the page are given
559 * in "fault_type". Desired permissions are included
560 * in "protection".
561 * fault_info is passed along to determine pagein cluster
562 * limits... it contains the expected reference pattern,
563 * cluster size if available, etc...
564 *
565 * If the desired page is known to be resident (for
566 * example, because it was previously wired down), asserting
567 * the "unwiring" parameter will speed the search.
568 *
569 * If the operation can be interrupted (by thread_abort
570 * or thread_terminate), then the "interruptible"
571 * parameter should be asserted.
572 *
573 * Results:
574 * The page containing the proper data is returned
575 * in "result_page".
576 *
577 * In/out conditions:
578 * The source object must be locked and referenced,
579 * and must donate one paging reference. The reference
580 * is not affected. The paging reference and lock are
581 * consumed.
582 *
583 * If the call succeeds, the object in which "result_page"
584 * resides is left locked and holding a paging reference.
585 * If this is not the original object, a busy page in the
586 * original object is returned in "top_page", to prevent other
587 * callers from pursuing this same data, along with a paging
588 * reference for the original object. The "top_page" should
589 * be destroyed when this guarantee is no longer required.
590 * The "result_page" is also left busy. It is not removed
591 * from the pageout queues.
592 */
593
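/*
 * Illustrative sketch of the caller contract described above (not part of
 * the original source; variable names are placeholders):
 *
 *	vm_object_lock(object);
 *	vm_object_paging_begin(object);		(donate the paging reference)
 *
 *	kr = vm_fault_page(object, offset, fault_type, FALSE, &prot,
 *			   &result_page, &top_page, &type_of_fault,
 *			   &error_code, FALSE, FALSE, &fault_info);
 *
 *	if (kr == VM_FAULT_SUCCESS) {
 *		(result_page is busy and its object is locked with a
 *		 paging reference held)
 *		... use result_page ...
 *		PAGE_WAKEUP_DONE(result_page);
 *		vm_fault_cleanup(result_page->object, top_page);
 *	}
 */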
594 vm_fault_return_t
595 vm_fault_page(
596 /* Arguments: */
597 vm_object_t first_object, /* Object to begin search */
598 vm_object_offset_t first_offset, /* Offset into object */
599 vm_prot_t fault_type, /* What access is requested */
600 boolean_t must_be_resident,/* Must page be resident? */
601 /* Modifies in place: */
602 vm_prot_t *protection, /* Protection for mapping */
603 /* Returns: */
604 vm_page_t *result_page, /* Page found, if successful */
605 vm_page_t *top_page, /* Page in top object, if
606 * not result_page. */
607 int *type_of_fault, /* if non-null, fill in with type of fault
608 * COW, zero-fill, etc... returned in trace point */
609 /* More arguments: */
610 kern_return_t *error_code, /* code if page is in error */
611 boolean_t no_zero_fill, /* don't zero fill absent pages */
612 #if MACH_PAGEMAP
613 boolean_t data_supply, /* treat as data_supply if
614 * it is a write fault and a full
615 * page is provided */
616 #else
617 __unused boolean_t data_supply,
618 #endif
619 vm_object_fault_info_t fault_info)
620 {
621 vm_page_t m;
622 vm_object_t object;
623 vm_object_offset_t offset;
624 vm_page_t first_m;
625 vm_object_t next_object;
626 vm_object_t copy_object;
627 boolean_t look_for_page;
628 vm_prot_t access_required = fault_type;
629 vm_prot_t wants_copy_flag;
630 CLUSTER_STAT(int pages_at_higher_offsets;)
631 CLUSTER_STAT(int pages_at_lower_offsets;)
632 kern_return_t wait_result;
633 boolean_t interruptible_state;
634 vm_fault_return_t error;
635 int my_fault;
636 uint32_t try_failed_count;
637 int interruptible; /* how may fault be interrupted? */
638 memory_object_t pager;
639
640 /*
641 * MACH page map - an optional optimization where a bit map is maintained
642 * by the VM subsystem for internal objects to indicate which pages of
643 * the object currently reside on backing store. This existence map
644 * duplicates information maintained by the vnode pager. It is
645 * created at the time of the first pageout against the object, i.e.
646 * at the same time pager for the object is created. The optimization
647 * is designed to eliminate pager interaction overhead, if it is
648 * 'known' that the page does not exist on backing store.
649 *
650 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
651 * either marked as paged out in the existence map for the object or no
652 * existence map exists for the object. MUST_ASK_PAGER() is one of the
653 * criteria in the decision to invoke the pager. It is also used as one
654 * of the criteria to terminate the scan for adjacent pages in a clustered
655 * pagein operation. Note that MUST_ASK_PAGER() always evaluates to TRUE for
656 * permanent objects. Note also that if the pager for an internal object
657 * has not been created, the pager is not invoked regardless of the value
658 * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
659 * for which a pager has been created.
660 *
661 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
662  * is marked as paged out in the existence map for the object.
663  * PAGED_OUT() is used to determine if a page has already been pushed
664 * into a copy object in order to avoid a redundant page out operation.
665 */
666 #if MACH_PAGEMAP
667 #define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \
668 != VM_EXTERNAL_STATE_ABSENT)
669 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
670 == VM_EXTERNAL_STATE_EXISTS)
671 #else
672 #define MUST_ASK_PAGER(o, f) (TRUE)
673 #define PAGED_OUT(o, f) (FALSE)
674 #endif
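/*
 * Without MACH_PAGEMAP there is no existence map, so we can never prove that
 * a page is absent from backing store: MUST_ASK_PAGER() is always TRUE and
 * PAGED_OUT() is always FALSE.
 */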
675
676 /*
677 * Recovery actions
678 */
679 #define PREPARE_RELEASE_PAGE(m) \
680 MACRO_BEGIN \
681 vm_page_lock_queues(); \
682 MACRO_END
683
684 #define DO_RELEASE_PAGE(m) \
685 MACRO_BEGIN \
686 PAGE_WAKEUP_DONE(m); \
687 if (!m->active && !m->inactive && !m->throttled)\
688 vm_page_activate(m); \
689 vm_page_unlock_queues(); \
690 MACRO_END
691
692 #define RELEASE_PAGE(m) \
693 MACRO_BEGIN \
694 PREPARE_RELEASE_PAGE(m); \
695 DO_RELEASE_PAGE(m); \
696 MACRO_END
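/*
 * RELEASE_PAGE() wakes up any threads waiting on the page, clears its busy
 * bit, and reactivates it if it is not already on one of the paging queues.
 */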
697
698 #if TRACEFAULTPAGE
699 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
700 #endif
701
702
703 #if MACH_KDB
704 /*
705 * If there are watchpoints set, then
706 * we don't want to give away write permission
707 * on a read fault. Make the task write fault,
708 * so that the watchpoint code notices the access.
709 */
710 if (db_watchpoint_list) {
711 /*
712 * If we aren't asking for write permission,
713 * then don't give it away. We're using write
714 * faults to set the dirty bit.
715 */
716 if (!(fault_type & VM_PROT_WRITE))
717 *protection &= ~VM_PROT_WRITE;
718 }
719 #endif /* MACH_KDB */
720
721 interruptible = fault_info->interruptible;
722 interruptible_state = thread_interrupt_level(interruptible);
723
724 /*
725 * INVARIANTS (through entire routine):
726 *
727 * 1) At all times, we must either have the object
728 * lock or a busy page in some object to prevent
729 * some other thread from trying to bring in
730 * the same page.
731 *
732 * Note that we cannot hold any locks during the
733 * pager access or when waiting for memory, so
734 * we use a busy page then.
735 *
736 * 2) To prevent another thread from racing us down the
737 * shadow chain and entering a new page in the top
738 * object before we do, we must keep a busy page in
739 * the top object while following the shadow chain.
740 *
741 * 3) We must increment paging_in_progress on any object
742 * for which we have a busy page before dropping
743 * the object lock
744 *
745 * 4) We leave busy pages on the pageout queues.
746 * If the pageout daemon comes across a busy page,
747 * it will remove the page from the pageout queues.
748 */
749
750 object = first_object;
751 offset = first_offset;
752 first_m = VM_PAGE_NULL;
753 access_required = fault_type;
754
755
756 XPR(XPR_VM_FAULT,
757 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
758 (integer_t)object, offset, fault_type, *protection, 0);
759
760 /*
761 * default type of fault
762 */
763 my_fault = DBG_CACHE_HIT_FAULT;
764
765 while (TRUE) {
766 #if TRACEFAULTPAGE
767 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
768 #endif
769 if (!object->alive) {
770 /*
771 * object is no longer valid
772 * clean up and return error
773 */
774 vm_fault_cleanup(object, first_m);
775 thread_interrupt_level(interruptible_state);
776
777 return (VM_FAULT_MEMORY_ERROR);
778 }
779
780 /*
781 * See whether the page at 'offset' is resident
782 */
783 m = vm_page_lookup(object, offset);
784 #if TRACEFAULTPAGE
785 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
786 #endif
787 if (m != VM_PAGE_NULL) {
788
789 if (m->busy) {
790 /*
791 * The page is being brought in,
792 * wait for it and then retry.
793 *
794 * A possible optimization: if the page
795 * is known to be resident, we can ignore
796 * pages that are absent (regardless of
797 * whether they're busy).
798 */
799 #if TRACEFAULTPAGE
800 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
801 #endif
802 wait_result = PAGE_SLEEP(object, m, interruptible);
803 XPR(XPR_VM_FAULT,
804 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
805 (integer_t)object, offset,
806 (integer_t)m, 0, 0);
807 counter(c_vm_fault_page_block_busy_kernel++);
808
809 if (wait_result != THREAD_AWAKENED) {
810 vm_fault_cleanup(object, first_m);
811 thread_interrupt_level(interruptible_state);
812
813 if (wait_result == THREAD_RESTART)
814 return (VM_FAULT_RETRY);
815 else
816 return (VM_FAULT_INTERRUPTED);
817 }
818 continue;
819 }
820
821 if (m->phys_page == vm_page_guard_addr) {
822 /*
823 * Guard page: off limits !
824 */
825 if (fault_type == VM_PROT_NONE) {
826 /*
827 * The fault is not requesting any
828 * access to the guard page, so it must
829 * be just to wire or unwire it.
830 * Let's pretend it succeeded...
831 */
832 m->busy = TRUE;
833 *result_page = m;
834 assert(first_m == VM_PAGE_NULL);
835 *top_page = first_m;
836 if (type_of_fault)
837 *type_of_fault = DBG_GUARD_FAULT;
838 return VM_FAULT_SUCCESS;
839 } else {
840 /*
841 * The fault requests access to the
842 * guard page: let's deny that !
843 */
844 vm_fault_cleanup(object, first_m);
845 thread_interrupt_level(interruptible_state);
846 return VM_FAULT_MEMORY_ERROR;
847 }
848 }
849
850 if (m->error) {
851 /*
852 * The page is in error, give up now.
853 */
854 #if TRACEFAULTPAGE
855 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
856 #endif
857 if (error_code)
858 *error_code = KERN_MEMORY_ERROR;
859 VM_PAGE_FREE(m);
860
861 vm_fault_cleanup(object, first_m);
862 thread_interrupt_level(interruptible_state);
863
864 return (VM_FAULT_MEMORY_ERROR);
865 }
866 if (m->restart) {
867 /*
868 * The pager wants us to restart
869 * at the top of the chain,
870 * typically because it has moved the
871 * page to another pager, then do so.
872 */
873 #if TRACEFAULTPAGE
874 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
875 #endif
876 VM_PAGE_FREE(m);
877
878 vm_fault_cleanup(object, first_m);
879 thread_interrupt_level(interruptible_state);
880
881 return (VM_FAULT_RETRY);
882 }
883 if (m->absent) {
884 /*
885 * The page isn't busy, but is absent,
886 * therefore it's deemed "unavailable".
887 *
888 * Remove the non-existent page (unless it's
889 * in the top object) and move on down to the
890 * next object (if there is one).
891 */
892 #if TRACEFAULTPAGE
893 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
894 #endif
895 next_object = object->shadow;
896
897 if (next_object == VM_OBJECT_NULL) {
898 /*
899 * Absent page at bottom of shadow
900 * chain; zero fill the page we left
901 * busy in the first object, and free
902 * the absent page.
903 */
904 assert(!must_be_resident);
905
906 /*
907 * check for any conditions that prevent
908 * us from creating a new zero-fill page
909 * vm_fault_check will do all of the
910 * fault cleanup in the case of an error condition
911 * including resetting the thread_interrupt_level
912 */
913 error = vm_fault_check(object, m, first_m, interruptible_state);
914
915 if (error != VM_FAULT_SUCCESS)
916 return (error);
917
918 XPR(XPR_VM_FAULT,
919 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
920 (integer_t)object, offset,
921 (integer_t)m,
922 (integer_t)first_object, 0);
923
924 if (object != first_object) {
925 /*
926 * free the absent page we just found
927 */
928 VM_PAGE_FREE(m);
929
930 /*
931 * drop reference and lock on current object
932 */
933 vm_object_paging_end(object);
934 vm_object_unlock(object);
935
936 /*
937 * grab the original page we
938 * 'soldered' in place and
939 * retake lock on 'first_object'
940 */
941 m = first_m;
942 first_m = VM_PAGE_NULL;
943
944 object = first_object;
945 offset = first_offset;
946
947 vm_object_lock(object);
948 } else {
949 /*
950 * we're going to use the absent page we just found
951 * so convert it to a 'busy' page
952 */
953 m->absent = FALSE;
954 m->busy = TRUE;
955 }
956 /*
957 * zero-fill the page and put it on
958 * the correct paging queue
959 */
960 my_fault = vm_fault_zero_page(m, no_zero_fill);
961
962 break;
963 } else {
964 if (must_be_resident)
965 vm_object_paging_end(object);
966 else if (object != first_object) {
967 vm_object_paging_end(object);
968 VM_PAGE_FREE(m);
969 } else {
970 first_m = m;
971 m->absent = FALSE;
972 m->busy = TRUE;
973
974 vm_page_lockspin_queues();
975 VM_PAGE_QUEUES_REMOVE(m);
976 vm_page_unlock_queues();
977 }
978 XPR(XPR_VM_FAULT,
979 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
980 (integer_t)object, offset,
981 (integer_t)next_object,
982 offset+object->shadow_offset,0);
983
984 offset += object->shadow_offset;
985 fault_info->lo_offset += object->shadow_offset;
986 fault_info->hi_offset += object->shadow_offset;
987 access_required = VM_PROT_READ;
988
989 vm_object_lock(next_object);
990 vm_object_unlock(object);
991 object = next_object;
992 vm_object_paging_begin(object);
993
994 /*
995 * reset to default type of fault
996 */
997 my_fault = DBG_CACHE_HIT_FAULT;
998
999 continue;
1000 }
1001 }
1002 if ((m->cleaning)
1003 && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1004 && (fault_type & VM_PROT_WRITE)) {
1005 /*
1006 * This is a copy-on-write fault that will
1007 * cause us to revoke access to this page, but
1008 * this page is in the process of being cleaned
1009 * in a clustered pageout. We must wait until
1010 * the cleaning operation completes before
1011 * revoking access to the original page,
1012 * otherwise we might attempt to remove a
1013 * wired mapping.
1014 */
1015 #if TRACEFAULTPAGE
1016 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
1017 #endif
1018 XPR(XPR_VM_FAULT,
1019 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1020 (integer_t)object, offset,
1021 (integer_t)m, 0, 0);
1022 /*
1023 * take an extra ref so that object won't die
1024 */
1025 vm_object_reference_locked(object);
1026
1027 vm_fault_cleanup(object, first_m);
1028
1029 counter(c_vm_fault_page_block_backoff_kernel++);
1030 vm_object_lock(object);
1031 assert(object->ref_count > 0);
1032
1033 m = vm_page_lookup(object, offset);
1034
1035 if (m != VM_PAGE_NULL && m->cleaning) {
1036 PAGE_ASSERT_WAIT(m, interruptible);
1037
1038 vm_object_unlock(object);
1039 wait_result = thread_block(THREAD_CONTINUE_NULL);
1040 vm_object_deallocate(object);
1041
1042 goto backoff;
1043 } else {
1044 vm_object_unlock(object);
1045
1046 vm_object_deallocate(object);
1047 thread_interrupt_level(interruptible_state);
1048
1049 return (VM_FAULT_RETRY);
1050 }
1051 }
1052 if (type_of_fault == NULL && m->speculative) {
1053 /*
1054 * If we were passed a non-NULL pointer for
1055 * "type_of_fault", than we came from
1056 * vm_fault... we'll let it deal with
1057 * this condition, since it
1058 * needs to see m->speculative to correctly
1059 * account the pageins, otherwise...
1060 * take it off the speculative queue, we'll
1061 * let the caller of vm_fault_page deal
1062 * with getting it onto the correct queue
1063 */
1064 vm_page_lockspin_queues();
1065 VM_PAGE_QUEUES_REMOVE(m);
1066 vm_page_unlock_queues();
1067 }
1068
1069 if (m->encrypted) {
1070 /*
1071 * ENCRYPTED SWAP:
1072 * the user needs access to a page that we
1073 * encrypted before paging it out.
1074 * Decrypt the page now.
1075 * Keep it busy to prevent anyone from
1076 * accessing it during the decryption.
1077 */
1078 m->busy = TRUE;
1079 vm_page_decrypt(m, 0);
1080 assert(object == m->object);
1081 assert(m->busy);
1082 PAGE_WAKEUP_DONE(m);
1083
1084 /*
1085 * Retry from the top, in case
1086 * something changed while we were
1087 * decrypting.
1088 */
1089 continue;
1090 }
1091 ASSERT_PAGE_DECRYPTED(m);
1092
1093 if (m->object->code_signed) {
1094 /*
1095 * CODE SIGNING:
1096 * We just paged in a page from a signed
1097 * memory object but we don't need to
1098  * validate it now. We'll validate it if and
1099  * when it gets mapped into a user address
1100 * space for the first time or when the page
1101 * gets copied to another object as a result
1102 * of a copy-on-write.
1103 */
1104 }
1105
1106 /*
1107 * We mark the page busy and leave it on
1108 * the pageout queues. If the pageout
1109  * daemon comes across it, then it will
1110 * remove the page from the queue, but not the object
1111 */
1112 #if TRACEFAULTPAGE
1113 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1114 #endif
1115 XPR(XPR_VM_FAULT,
1116 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1117 (integer_t)object, offset, (integer_t)m, 0, 0);
1118 assert(!m->busy);
1119 assert(!m->absent);
1120
1121 m->busy = TRUE;
1122 break;
1123 }
1124
1125
1126 /*
1127 * we get here when there is no page present in the object at
1128 * the offset we're interested in... we'll allocate a page
1129 * at this point if the pager associated with
1130 * this object can provide the data or we're the top object...
1131 * object is locked; m == NULL
1132 */
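	/*
	 * We will ask the pager only if one has already been created for
	 * this object, the existence map does not rule the page out, and
	 * the caller is not supplying the data itself.
	 */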
1133 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply);
1134
1135 #if TRACEFAULTPAGE
1136 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1137 #endif
1138 if ((look_for_page || (object == first_object)) && !must_be_resident && !object->phys_contiguous) {
1139 /*
1140 * Allocate a new page for this object/offset pair
1141 */
1142 m = vm_page_grab();
1143 #if TRACEFAULTPAGE
1144 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1145 #endif
1146 if (m == VM_PAGE_NULL) {
1147
1148 vm_fault_cleanup(object, first_m);
1149 thread_interrupt_level(interruptible_state);
1150
1151 return (VM_FAULT_MEMORY_SHORTAGE);
1152 }
1153 vm_page_insert(m, object, offset);
1154 }
1155 if (look_for_page && !must_be_resident) {
1156 kern_return_t rc;
1157
1158 /*
1159 * If the memory manager is not ready, we
1160 * cannot make requests.
1161 */
1162 if (!object->pager_ready) {
1163 #if TRACEFAULTPAGE
1164 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1165 #endif
1166 if (m != VM_PAGE_NULL)
1167 VM_PAGE_FREE(m);
1168
1169 XPR(XPR_VM_FAULT,
1170 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1171 (integer_t)object, offset, 0, 0, 0);
1172
1173 /*
1174 * take an extra ref so object won't die
1175 */
1176 vm_object_reference_locked(object);
1177 vm_fault_cleanup(object, first_m);
1178 counter(c_vm_fault_page_block_backoff_kernel++);
1179
1180 vm_object_lock(object);
1181 assert(object->ref_count > 0);
1182
1183 if (!object->pager_ready) {
1184 wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1185
1186 vm_object_unlock(object);
1187 if (wait_result == THREAD_WAITING)
1188 wait_result = thread_block(THREAD_CONTINUE_NULL);
1189 vm_object_deallocate(object);
1190
1191 goto backoff;
1192 } else {
1193 vm_object_unlock(object);
1194 vm_object_deallocate(object);
1195 thread_interrupt_level(interruptible_state);
1196
1197 return (VM_FAULT_RETRY);
1198 }
1199 }
1200 if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1201 /*
1202 * If there are too many outstanding page
1203 * requests pending on this external object, we
1204 * wait for them to be resolved now.
1205 */
1206 #if TRACEFAULTPAGE
1207 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1208 #endif
1209 if (m != VM_PAGE_NULL)
1210 VM_PAGE_FREE(m);
1211 /*
1212 * take an extra ref so object won't die
1213 */
1214 vm_object_reference_locked(object);
1215
1216 vm_fault_cleanup(object, first_m);
1217
1218 counter(c_vm_fault_page_block_backoff_kernel++);
1219
1220 vm_object_lock(object);
1221 assert(object->ref_count > 0);
1222
1223 if (object->paging_in_progress > vm_object_pagein_throttle) {
1224 vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_IN_PROGRESS, interruptible);
1225
1226 vm_object_unlock(object);
1227 wait_result = thread_block(THREAD_CONTINUE_NULL);
1228 vm_object_deallocate(object);
1229
1230 goto backoff;
1231 } else {
1232 vm_object_unlock(object);
1233 vm_object_deallocate(object);
1234 thread_interrupt_level(interruptible_state);
1235
1236 return (VM_FAULT_RETRY);
1237 }
1238 }
1239 if (m != VM_PAGE_NULL) {
1240 /*
1241 * Indicate that the page is waiting for data
1242 * from the memory manager.
1243 */
1244 m->list_req_pending = TRUE;
1245 m->absent = TRUE;
1246 }
1247
1248 #if TRACEFAULTPAGE
1249 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1250 #endif
1251
1252 /*
1253 * It's possible someone called vm_object_destroy while we weren't
1254 * holding the object lock. If that has happened, then bail out
1255 * here.
1256 */
1257
1258 pager = object->pager;
1259
1260 if (pager == MEMORY_OBJECT_NULL) {
1261 vm_fault_cleanup(object, first_m);
1262 thread_interrupt_level(interruptible_state);
1263 return VM_FAULT_MEMORY_ERROR;
1264 }
1265
1266 /*
1267 * We have an absent page in place for the faulting offset,
1268 * so we can release the object lock.
1269 */
1270
1271 vm_object_unlock(object);
1272
1273 /*
1274 * If this object uses a copy_call strategy,
1275 * and we are interested in a copy of this object
1276 * (having gotten here only by following a
1277 * shadow chain), then tell the memory manager
1278 * via a flag added to the desired_access
1279 * parameter, so that it can detect a race
1280 * between our walking down the shadow chain
1281 * and its pushing pages up into a copy of
1282 * the object that it manages.
1283 */
1284 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1285 wants_copy_flag = VM_PROT_WANTS_COPY;
1286 else
1287 wants_copy_flag = VM_PROT_NONE;
1288
1289 XPR(XPR_VM_FAULT,
1290 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1291 (integer_t)object, offset, (integer_t)m,
1292 access_required | wants_copy_flag, 0);
1293
1294 /*
1295 * Call the memory manager to retrieve the data.
1296 */
1297 rc = memory_object_data_request(
1298 pager,
1299 offset + object->paging_offset,
1300 PAGE_SIZE,
1301 access_required | wants_copy_flag,
1302 (memory_object_fault_info_t)fault_info);
1303
1304 #if TRACEFAULTPAGE
1305 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1306 #endif
1307 vm_object_lock(object);
1308
1309 if (rc != KERN_SUCCESS) {
1310
1311 vm_fault_cleanup(object, first_m);
1312 thread_interrupt_level(interruptible_state);
1313
1314 return ((rc == MACH_SEND_INTERRUPTED) ?
1315 VM_FAULT_INTERRUPTED :
1316 VM_FAULT_MEMORY_ERROR);
1317 }
1318 if ((interruptible != THREAD_UNINT) && (current_thread()->sched_mode & TH_MODE_ABORT)) {
1319
1320 vm_fault_cleanup(object, first_m);
1321 thread_interrupt_level(interruptible_state);
1322
1323 return (VM_FAULT_INTERRUPTED);
1324 }
1325 if (m == VM_PAGE_NULL && object->phys_contiguous) {
1326 /*
1327 * No page here means that the object we
1328 * initially looked up was "physically
1329 * contiguous" (i.e. device memory). However,
1330 * with Virtual VRAM, the object might not
1331 * be backed by that device memory anymore,
1332 * so we're done here only if the object is
1333 * still "phys_contiguous".
1334 * Otherwise, if the object is no longer
1335 * "phys_contiguous", we need to retry the
1336 * page fault against the object's new backing
1337 * store (different memory object).
1338 */
1339 break;
1340 }
1341 /*
1342 * potentially a pagein fault
1343 * if we make it through the state checks
1344 	 * above, then we'll count it as such
1345 */
1346 my_fault = DBG_PAGEIN_FAULT;
1347
1348 /*
1349 * Retry with same object/offset, since new data may
1350 * be in a different page (i.e., m is meaningless at
1351 * this point).
1352 */
1353 continue;
1354 }
1355
1356 /*
1357 * We get here if the object has no pager, or an existence map
1358 * exists and indicates the page isn't present on the pager
1359 * or we're unwiring a page. If a pager exists, but there
1360 * is no existence map, then the m->absent case above handles
1361 * the ZF case when the pager can't provide the page
1362 */
1363 #if TRACEFAULTPAGE
1364 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1365 #endif
1366 if (object == first_object)
1367 first_m = m;
1368 else
1369 assert(m == VM_PAGE_NULL);
1370
1371 XPR(XPR_VM_FAULT,
1372 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1373 (integer_t)object, offset, (integer_t)m,
1374 (integer_t)object->shadow, 0);
1375
1376 next_object = object->shadow;
1377
1378 if (next_object == VM_OBJECT_NULL) {
1379 /*
1380 	 * we've hit the bottom of the shadow chain,
1381 * fill the page in the top object with zeros.
1382 */
1383 assert(!must_be_resident);
1384
1385 if (object != first_object) {
1386 vm_object_paging_end(object);
1387 vm_object_unlock(object);
1388
1389 object = first_object;
1390 offset = first_offset;
1391 vm_object_lock(object);
1392 }
1393 m = first_m;
1394 assert(m->object == object);
1395 first_m = VM_PAGE_NULL;
1396
1397 /*
1398 * check for any conditions that prevent
1399 * us from creating a new zero-fill page
1400 * vm_fault_check will do all of the
1401 * fault cleanup in the case of an error condition
1402 * including resetting the thread_interrupt_level
1403 */
1404 error = vm_fault_check(object, m, first_m, interruptible_state);
1405
1406 if (error != VM_FAULT_SUCCESS)
1407 return (error);
1408
1409 if (m == VM_PAGE_NULL) {
1410 m = vm_page_grab();
1411
1412 if (m == VM_PAGE_NULL) {
1413 vm_fault_cleanup(object, VM_PAGE_NULL);
1414 thread_interrupt_level(interruptible_state);
1415
1416 return (VM_FAULT_MEMORY_SHORTAGE);
1417 }
1418 vm_page_insert(m, object, offset);
1419 }
1420 my_fault = vm_fault_zero_page(m, no_zero_fill);
1421
1422 break;
1423
1424 } else {
1425 /*
1426 * Move on to the next object. Lock the next
1427 * object before unlocking the current one.
1428 */
1429 if ((object != first_object) || must_be_resident)
1430 vm_object_paging_end(object);
1431
1432 offset += object->shadow_offset;
1433 fault_info->lo_offset += object->shadow_offset;
1434 fault_info->hi_offset += object->shadow_offset;
1435 access_required = VM_PROT_READ;
1436
1437 vm_object_lock(next_object);
1438 vm_object_unlock(object);
1439
1440 object = next_object;
1441 vm_object_paging_begin(object);
1442 }
1443 }
1444
1445 /*
1446 * PAGE HAS BEEN FOUND.
1447 *
1448 * This page (m) is:
1449 * busy, so that we can play with it;
1450 * not absent, so that nobody else will fill it;
1451 * possibly eligible for pageout;
1452 *
1453 * The top-level page (first_m) is:
1454 * VM_PAGE_NULL if the page was found in the
1455 * top-level object;
1456 * busy, not absent, and ineligible for pageout.
1457 *
1458 * The current object (object) is locked. A paging
1459 * reference is held for the current and top-level
1460 * objects.
1461 */
1462
1463 #if TRACEFAULTPAGE
1464 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1465 #endif
1466 #if EXTRA_ASSERTIONS
1467 if (m != VM_PAGE_NULL) {
1468 assert(m->busy && !m->absent);
1469 assert((first_m == VM_PAGE_NULL) ||
1470 (first_m->busy && !first_m->absent &&
1471 !first_m->active && !first_m->inactive));
1472 }
1473 #endif /* EXTRA_ASSERTIONS */
1474
1475 /*
1476 * ENCRYPTED SWAP:
1477 * If we found a page, we must have decrypted it before we
1478 * get here...
1479 */
1480 if (m != VM_PAGE_NULL) {
1481 ASSERT_PAGE_DECRYPTED(m);
1482 }
1483
1484 XPR(XPR_VM_FAULT,
1485 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1486 (integer_t)object, offset, (integer_t)m,
1487 (integer_t)first_object, (integer_t)first_m);
1488
1489 /*
1490 * If the page is being written, but isn't
1491 * already owned by the top-level object,
1492 * we have to copy it into a new page owned
1493 * by the top-level object.
1494 */
1495 if ((object != first_object) && (m != VM_PAGE_NULL)) {
1496
1497 #if TRACEFAULTPAGE
1498 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1499 #endif
1500 if (fault_type & VM_PROT_WRITE) {
1501 vm_page_t copy_m;
1502
1503 /*
1504 * We only really need to copy if we
1505 * want to write it.
1506 */
1507 assert(!must_be_resident);
1508
1509 /*
1510 	 * Are we protecting the system from
1511 	 * backing store exhaustion?  If so,
1512 	 * sleep unless we are privileged.
1513 */
1514 if (vm_backing_store_low) {
1515 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1516
1517 RELEASE_PAGE(m);
1518 vm_fault_cleanup(object, first_m);
1519
1520 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1521
1522 thread_block(THREAD_CONTINUE_NULL);
1523 thread_interrupt_level(interruptible_state);
1524
1525 return (VM_FAULT_RETRY);
1526 }
1527 }
1528 /*
1529 * If we try to collapse first_object at this
1530 * point, we may deadlock when we try to get
1531 * the lock on an intermediate object (since we
1532 * have the bottom object locked). We can't
1533 * unlock the bottom object, because the page
1534 * we found may move (by collapse) if we do.
1535 *
1536 * Instead, we first copy the page. Then, when
1537 * we have no more use for the bottom object,
1538 * we unlock it and try to collapse.
1539 *
1540 * Note that we copy the page even if we didn't
1541 * need to... that's the breaks.
1542 */
1543
1544 /*
1545 * Allocate a page for the copy
1546 */
1547 copy_m = vm_page_grab();
1548
1549 if (copy_m == VM_PAGE_NULL) {
1550 RELEASE_PAGE(m);
1551
1552 vm_fault_cleanup(object, first_m);
1553 thread_interrupt_level(interruptible_state);
1554
1555 return (VM_FAULT_MEMORY_SHORTAGE);
1556 }
1557 XPR(XPR_VM_FAULT,
1558 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1559 (integer_t)object, offset,
1560 (integer_t)m, (integer_t)copy_m, 0);
1561
1562 vm_page_copy(m, copy_m);
1563
1564 /*
1565 * If another map is truly sharing this
1566 * page with us, we have to flush all
1567 * uses of the original page, since we
1568 * can't distinguish those which want the
1569 * original from those which need the
1570 * new copy.
1571 *
1572 * XXXO If we know that only one map has
1573 * access to this page, then we could
1574 * avoid the pmap_disconnect() call.
1575 */
1576 if (m->pmapped)
1577 pmap_disconnect(m->phys_page);
1578
1579 assert(!m->cleaning);
1580
1581 /*
1582 * We no longer need the old page or object.
1583 */
1584 PAGE_WAKEUP_DONE(m);
1585 vm_object_paging_end(object);
1586 vm_object_unlock(object);
1587
1588 my_fault = DBG_COW_FAULT;
1589 VM_STAT_INCR(cow_faults);
1590 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1591 current_task()->cow_faults++;
1592
1593 object = first_object;
1594 offset = first_offset;
1595
1596 vm_object_lock(object);
1597 /*
1598 * get rid of the place holder
1599 * page that we soldered in earlier
1600 */
1601 VM_PAGE_FREE(first_m);
1602 first_m = VM_PAGE_NULL;
1603
1604 /*
1605 * and replace it with the
1606 * page we just copied into
1607 */
1608 assert(copy_m->busy);
1609 vm_page_insert(copy_m, object, offset);
1610 copy_m->dirty = TRUE;
1611
1612 m = copy_m;
1613 /*
1614 * Now that we've gotten the copy out of the
1615 * way, let's try to collapse the top object.
1616 * But we have to play ugly games with
1617 * paging_in_progress to do that...
1618 */
1619 vm_object_paging_end(object);
1620 vm_object_collapse(object, offset, TRUE);
1621 vm_object_paging_begin(object);
1622
1623 } else
1624 *protection &= (~VM_PROT_WRITE);
1625 }
1626 /*
1627 * Now check whether the page needs to be pushed into the
1628 * copy object. The use of asymmetric copy on write for
1629 * shared temporary objects means that we may do two copies to
1630 * satisfy the fault; one above to get the page from a
1631 * shadowed object, and one here to push it into the copy.
1632 */
1633 try_failed_count = 0;
1634
1635 while ((copy_object = first_object->copy) != VM_OBJECT_NULL && (m != VM_PAGE_NULL)) {
1636 vm_object_offset_t copy_offset;
1637 vm_page_t copy_m;
1638
1639 #if TRACEFAULTPAGE
1640 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1641 #endif
1642 /*
1643 * If the page is being written, but hasn't been
1644 * copied to the copy-object, we have to copy it there.
1645 */
1646 if ((fault_type & VM_PROT_WRITE) == 0) {
1647 *protection &= ~VM_PROT_WRITE;
1648 break;
1649 }
1650
1651 /*
1652 * If the page was guaranteed to be resident,
1653 * we must have already performed the copy.
1654 */
1655 if (must_be_resident)
1656 break;
1657
1658 /*
1659 * Try to get the lock on the copy_object.
1660 */
1661 if (!vm_object_lock_try(copy_object)) {
1662
1663 vm_object_unlock(object);
1664 try_failed_count++;
1665
1666 mutex_pause(try_failed_count); /* wait a bit */
1667 vm_object_lock(object);
1668
1669 continue;
1670 }
1671 try_failed_count = 0;
1672
1673 /*
1674 * Make another reference to the copy-object,
1675 * to keep it from disappearing during the
1676 * copy.
1677 */
1678 vm_object_reference_locked(copy_object);
1679
1680 /*
1681 * Does the page exist in the copy?
1682 */
1683 copy_offset = first_offset - copy_object->shadow_offset;
1684
1685 if (copy_object->size <= copy_offset)
1686 /*
1687 * Copy object doesn't cover this page -- do nothing.
1688 */
1689 ;
1690 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1691 /*
1692 * Page currently exists in the copy object
1693 */
1694 if (copy_m->busy) {
1695 /*
1696 * If the page is being brought
1697 * in, wait for it and then retry.
1698 */
1699 RELEASE_PAGE(m);
1700
1701 /*
1702 * take an extra ref so object won't die
1703 */
1704 vm_object_reference_locked(copy_object);
1705 vm_object_unlock(copy_object);
1706 vm_fault_cleanup(object, first_m);
1707 counter(c_vm_fault_page_block_backoff_kernel++);
1708
1709 vm_object_lock(copy_object);
1710 assert(copy_object->ref_count > 0);
1711 VM_OBJ_RES_DECR(copy_object);
1712 vm_object_lock_assert_exclusive(copy_object);
1713 copy_object->ref_count--;
1714 assert(copy_object->ref_count > 0);
1715 copy_m = vm_page_lookup(copy_object, copy_offset);
1716 /*
1717 * ENCRYPTED SWAP:
1718 * it's OK if the "copy_m" page is encrypted,
1719 * because we're not moving it nor handling its
1720 * contents.
1721 */
1722 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1723 PAGE_ASSERT_WAIT(copy_m, interruptible);
1724
1725 vm_object_unlock(copy_object);
1726 wait_result = thread_block(THREAD_CONTINUE_NULL);
1727 vm_object_deallocate(copy_object);
1728
1729 goto backoff;
1730 } else {
1731 vm_object_unlock(copy_object);
1732 vm_object_deallocate(copy_object);
1733 thread_interrupt_level(interruptible_state);
1734
1735 return (VM_FAULT_RETRY);
1736 }
1737 }
1738 }
1739 else if (!PAGED_OUT(copy_object, copy_offset)) {
1740 /*
1741 * If PAGED_OUT is TRUE, then the page used to exist
1742 * in the copy-object, and has already been paged out.
1743 * We don't need to repeat this. If PAGED_OUT is
1744 * FALSE, then either we don't know (!pager_created,
1745 * for example) or it hasn't been paged out.
1746 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1747 * We must copy the page to the copy object.
1748 */
1749
1750 if (vm_backing_store_low) {
1751 /*
1752 	 * We are protecting the system from
1753 	 * backing store exhaustion:
1754 	 * sleep unless we are privileged.
1755 */
1756 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1757 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1758
1759 RELEASE_PAGE(m);
1760 VM_OBJ_RES_DECR(copy_object);
1761 vm_object_lock_assert_exclusive(copy_object);
1762 copy_object->ref_count--;
1763 assert(copy_object->ref_count > 0);
1764
1765 vm_object_unlock(copy_object);
1766 vm_fault_cleanup(object, first_m);
1767 thread_block(THREAD_CONTINUE_NULL);
1768 thread_interrupt_level(interruptible_state);
1769
1770 return (VM_FAULT_RETRY);
1771 }
1772 }
1773 /*
1774 * Allocate a page for the copy
1775 */
1776 copy_m = vm_page_alloc(copy_object, copy_offset);
1777
1778 if (copy_m == VM_PAGE_NULL) {
1779 RELEASE_PAGE(m);
1780
1781 VM_OBJ_RES_DECR(copy_object);
1782 vm_object_lock_assert_exclusive(copy_object);
1783 copy_object->ref_count--;
1784 assert(copy_object->ref_count > 0);
1785
1786 vm_object_unlock(copy_object);
1787 vm_fault_cleanup(object, first_m);
1788 thread_interrupt_level(interruptible_state);
1789
1790 return (VM_FAULT_MEMORY_SHORTAGE);
1791 }
1792 /*
1793 * Must copy page into copy-object.
1794 */
1795 vm_page_copy(m, copy_m);
1796
1797 /*
1798 * If the old page was in use by any users
1799 * of the copy-object, it must be removed
1800 * from all pmaps. (We can't know which
1801 * pmaps use it.)
1802 */
1803 if (m->pmapped)
1804 pmap_disconnect(m->phys_page);
1805
1806 /*
1807 * If there's a pager, then immediately
1808 * page out this page, using the "initialize"
1809 * option. Else, we use the copy.
1810 */
1811 if ((!copy_object->pager_created)
1812 #if MACH_PAGEMAP
1813 || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
1814 #endif
1815 ) {
1816
1817 vm_page_lockspin_queues();
1818 assert(!m->cleaning);
1819 vm_page_activate(copy_m);
1820 vm_page_unlock_queues();
1821
1822 copy_m->dirty = TRUE;
1823 PAGE_WAKEUP_DONE(copy_m);
1824 }
1825 else {
1826 assert(copy_m->busy == TRUE);
1827 assert(!m->cleaning);
1828
1829 /*
1830 * dirty is protected by the object lock
1831 */
1832 copy_m->dirty = TRUE;
1833
1834 /*
1835 * The page is already ready for pageout:
1836 * not on pageout queues and busy.
1837 * Unlock everything except the
1838 * copy_object itself.
1839 */
1840 vm_object_unlock(object);
1841
1842 /*
1843 * Write the page to the copy-object,
1844 * flushing it from the kernel.
1845 */
1846 vm_pageout_initialize_page(copy_m);
1847
1848 /*
1849 * Since the pageout may have
1850 * temporarily dropped the
1851 * copy_object's lock, we
1852 * check whether we'll have
1853 * to deallocate the hard way.
1854 */
1855 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
1856 vm_object_unlock(copy_object);
1857 vm_object_deallocate(copy_object);
1858 vm_object_lock(object);
1859
1860 continue;
1861 }
1862 /*
1863 * Pick back up the old object's
1864 * lock. [It is safe to do so,
1865 * since it must be deeper in the
1866 * object tree.]
1867 */
1868 vm_object_lock(object);
1869 }
1870 /*
1871 * Because we're pushing a page upward
1872 * in the object tree, we must restart
1873 * any faults that are waiting here.
1874 * [Note that this is an expansion of
1875 * PAGE_WAKEUP that uses the THREAD_RESTART
1876 * wait result]. Can't turn off the page's
1877 * busy bit because we're not done with it.
1878 */
1879 if (m->wanted) {
1880 m->wanted = FALSE;
1881 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
1882 }
1883 }
1884 /*
1885 * The reference count on copy_object must be
1886 * at least 2: one for our extra reference,
1887 * and at least one from the outside world
1888 * (we checked that when we last locked
1889 * copy_object).
1890 */
1891 vm_object_lock_assert_exclusive(copy_object);
1892 copy_object->ref_count--;
1893 assert(copy_object->ref_count > 0);
1894
1895 VM_OBJ_RES_DECR(copy_object);
1896 vm_object_unlock(copy_object);
1897
1898 break;
1899 }
1900 *result_page = m;
1901 *top_page = first_m;
1902
1903 XPR(XPR_VM_FAULT,
1904 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1905 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0);
1906
1907 if (m != VM_PAGE_NULL) {
1908 if (my_fault == DBG_PAGEIN_FAULT) {
1909
1910 VM_STAT_INCR(pageins);
1911 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
1912 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1913 current_task()->pageins++;
1914
1915 if (m->object->internal) {
1916 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
1917 } else {
1918 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
1919 }
1920
1921 /*
1922 * evaluate access pattern and update state
1923 * vm_fault_deactivate_behind depends on the
1924 * state being up to date
1925 */
1926 vm_fault_is_sequential(object, offset, fault_info->behavior);
1927
1928 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
1929 }
1930 if (type_of_fault)
1931 *type_of_fault = my_fault;
1932 } else
1933 vm_object_unlock(object);
1934
1935 thread_interrupt_level(interruptible_state);
1936
1937 #if TRACEFAULTPAGE
1938 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
1939 #endif
1940 return (VM_FAULT_SUCCESS);
1941
1942 backoff:
1943 thread_interrupt_level(interruptible_state);
1944
1945 if (wait_result == THREAD_INTERRUPTED)
1946 return (VM_FAULT_INTERRUPTED);
1947 return (VM_FAULT_RETRY);
1948
1949 #undef RELEASE_PAGE
1950 }
1951
1952
1953
1954 /*
1955 * page queue lock must NOT be held
1956 * m->object must be locked
1957 *
1958 * NOTE: m->object could be locked "shared" only if we are called
1959 * from vm_fault() as part of a soft fault. If so, we must be
1960 * careful not to modify the VM object in any way that is not
1961 * legal under a shared lock...
1962 */
1963 unsigned long cs_enter_tainted_rejected = 0;
1964 unsigned long cs_enter_tainted_accepted = 0;
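/*
 * vm_fault_enter:
 *
 * Enter the resolved page "m" into the given pmap at "vaddr".
 * Along the way this validates the page against its code signature
 * if needed, synchronizes the data/instruction caches the first time
 * the page is mapped anywhere, and adjusts the page queues (wire,
 * unwire, activate or mark speculative) according to the fault type.
 */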
1965 kern_return_t
1966 vm_fault_enter(vm_page_t m,
1967 pmap_t pmap,
1968 vm_map_offset_t vaddr,
1969 vm_prot_t prot,
1970 boolean_t wired,
1971 boolean_t change_wiring,
1972 boolean_t no_cache,
1973 int *type_of_fault)
1974 {
1975 unsigned int cache_attr;
1976 kern_return_t kr;
1977 boolean_t previously_pmapped = m->pmapped;
1978
1979 vm_object_lock_assert_held(m->object);
1980 #if DEBUG
1981 mutex_assert(&vm_page_queue_lock, MA_NOTOWNED);
1982 #endif /* DEBUG */
1983
1984 if (m->phys_page == vm_page_guard_addr) {
1985 assert(m->fictitious);
1986 return KERN_SUCCESS;
1987 }
1988
1989 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
1990
1991 if (m->object->code_signed && !m->cs_validated &&
1992 pmap != kernel_pmap) {
1993 /*
1994 * CODE SIGNING:
1995 * This page comes from a VM object backed by a
1996 * signed memory object and it hasn't been validated yet.
1997 * We're about to enter it into a process address space,
1998 * so we need to validate its signature now.
1999 */
2000 vm_object_lock_assert_exclusive(m->object);
2001
2002 /* VM map still locked, so 1 ref will remain on VM object */
2003
2004 vm_page_validate_cs(m);
2005 }
2006
2007 if (m->pmapped == FALSE) {
2008 /*
2009 * This is the first time this page is being
2010 * mapped in an address space (pmapped == FALSE).
2011 *
2012 * Part of that page may still be in the data cache
2013 * and not flushed to memory. In case we end up
2014 * accessing that page via the instruction cache,
2015 * we need to ensure that the 2 caches are in sync.
2016 */
2017 pmap_sync_page_data_phys(m->phys_page);
2018
2019 if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2020 /*
2021 * found it in the cache, but this
2022 * is the first fault-in of the page (m->pmapped == FALSE)
2023 * so it must have come in as part of
2024 * a cluster... account 1 pagein against it
2025 */
2026 VM_STAT_INCR(pageins);
2027 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2028
2029 if (m->object->internal) {
2030 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2031 } else {
2032 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2033 }
2034
2035 current_task()->pageins++;
2036
2037 *type_of_fault = DBG_PAGEIN_FAULT;
2038 }
2039 VM_PAGE_CONSUME_CLUSTERED(m);
2040
2041 } else if (cache_attr != VM_WIMG_DEFAULT)
2042 pmap_sync_page_attributes_phys(m->phys_page);
2043
2044 if (*type_of_fault != DBG_COW_FAULT) {
2045 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2046
2047 if (pmap == kernel_pmap) {
2048 DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2049 }
2050 }
2051
2052 if (m->cs_tainted) {
2053 /*
2054 * CODE SIGNING:
2055 * This page has been tainted and can not be trusted.
2056 * Let's notify the current process and let it take any
2057 * necessary precautions before we enter the tainted page
2058 * into its address space.
2059 */
2060 if (cs_invalid_page()) {
2061 /* reject the tainted page: abort the page fault */
2062 kr = KERN_MEMORY_ERROR;
2063 cs_enter_tainted_rejected++;
2064 } else {
2065 /* proceed with the tainted page */
2066 kr = KERN_SUCCESS;
2067 cs_enter_tainted_accepted++;
2068 }
2069 if (cs_debug || kr != KERN_SUCCESS) {
2070 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2071 "page %p obj %p off 0x%llx *** TAINTED ***\n",
2072 (long long)vaddr, m, m->object, m->offset);
2073 }
2074 } else {
2075 /* proceed with the valid page */
2076 kr = KERN_SUCCESS;
2077 }
2078
2079 if (kr == KERN_SUCCESS) {
2080 /*
2081 * NOTE: we may only hold the vm_object lock SHARED
2082 * at this point, but the update of pmapped is ok
2083 * since this is the ONLY bit updated behind the SHARED
2084 * lock... however, we need to figure out how to do an atomic
2085 * update on a bit field to make this less fragile... right
2086 * now I don't know how to coerce 'C' to give me the offset info
2087 * that's needed for an AtomicCompareAndSwap
2088 */
2089 m->pmapped = TRUE;
2090
2091 PMAP_ENTER(pmap, vaddr, m, prot, cache_attr, wired);
2092 }
2093
2094 /*
2095 * Hold queues lock to manipulate
2096 * the page queues. Change wiring
2097 * case is obvious.
2098 */
2099 if (change_wiring) {
2100 vm_page_lockspin_queues();
2101
2102 if (wired) {
2103 if (kr == KERN_SUCCESS) {
2104 vm_page_wire(m);
2105 }
2106 } else {
2107 vm_page_unwire(m);
2108 }
2109 vm_page_unlock_queues();
2110
2111 } else {
2112 if (kr != KERN_SUCCESS) {
2113 vm_page_lock_queues();
2114 vm_page_deactivate(m);
2115 vm_page_unlock_queues();
2116 } else {
2117 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count && !m->throttled) {
2118 vm_page_lockspin_queues();
2119 /*
2120 * test again now that we hold the page queue lock
2121 */
2122 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count) {
2123
2124 /*
2125 * If this is a no_cache mapping and the page has never been
2126 * mapped before or was previously a no_cache page, then we
2127 * want to leave pages in the speculative state so that they
2128 * can be readily recycled if free memory runs low. Otherwise
2129 * the page is activated as normal.
2130 */
2131
2132 if (no_cache && (!previously_pmapped || m->no_cache)) {
2133 m->no_cache = TRUE;
2134
2135 if (m->active || m->inactive)
2136 VM_PAGE_QUEUES_REMOVE(m);
2137
2138 if (!m->speculative)
2139 vm_page_speculate(m, TRUE);
2140
2141 } else if (!m->active && !m->inactive)
2142 vm_page_activate(m);
2143
2144 }
2145
2146 vm_page_unlock_queues();
2147 }
2148 }
2149 }
2150 return kr;
2151 }
2152
2153
2154 /*
2155 * Routine: vm_fault
2156 * Purpose:
2157 * Handle page faults, including pseudo-faults
2158 * used to change the wiring status of pages.
2159 * Returns:
2160 * Explicit continuations have been removed.
2161 * Implementation:
2162 * vm_fault and vm_fault_page save mucho state
2163 * in the moral equivalent of a closure. The state
2164 * structure is allocated when first entering vm_fault
2165 * and deallocated when leaving vm_fault.
2166 */
2167
2168 extern int _map_enter_debug;
2169
2170 unsigned long vm_fault_collapse_total = 0;
2171 unsigned long vm_fault_collapse_skipped = 0;
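/*
 * vm_fault_collapse_total counts every copy-on-write fault taken in the
 * fast path below; vm_fault_collapse_skipped counts the ones where the
 * shadow-chain collapse was deliberately skipped.
 */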
2172
2173 kern_return_t
2174 vm_fault(
2175 vm_map_t map,
2176 vm_map_offset_t vaddr,
2177 vm_prot_t fault_type,
2178 boolean_t change_wiring,
2179 int interruptible,
2180 pmap_t caller_pmap,
2181 vm_map_offset_t caller_pmap_addr)
2182 {
2183 vm_map_version_t version; /* Map version for verification */
2184 boolean_t wired; /* Should mapping be wired down? */
2185 vm_object_t object; /* Top-level object */
2186 vm_object_offset_t offset; /* Top-level offset */
2187 vm_prot_t prot; /* Protection for mapping */
2188 vm_object_t old_copy_object; /* Saved copy object */
2189 vm_page_t result_page; /* Result of vm_fault_page */
2190 vm_page_t top_page; /* Placeholder page */
2191 kern_return_t kr;
2192
2193 vm_page_t m; /* Fast access to result_page */
2194 kern_return_t error_code;
2195 vm_object_t cur_object;
2196 vm_object_offset_t cur_offset;
2197 vm_page_t cur_m;
2198 vm_object_t new_object;
2199 int type_of_fault;
2200 pmap_t pmap;
2201 boolean_t interruptible_state;
2202 vm_map_t real_map = map;
2203 vm_map_t original_map = map;
2204 vm_prot_t original_fault_type;
2205 struct vm_object_fault_info fault_info;
2206 boolean_t need_collapse = FALSE;
2207 int object_lock_type = 0;
2208 int cur_object_lock_type;
2209
2210
2211 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
2212 (int)((uint64_t)vaddr >> 32),
2213 (int)vaddr,
2214 0,
2215 0,
2216 0);
2217
2218 if (get_preemption_level() != 0) {
2219 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
2220 (int)((uint64_t)vaddr >> 32),
2221 (int)vaddr,
2222 KERN_FAILURE,
2223 0,
2224 0);
2225
2226 return (KERN_FAILURE);
2227 }
2228 interruptible_state = thread_interrupt_level(interruptible);
2229
2230 VM_STAT_INCR(faults);
2231 current_task()->faults++;
2232 original_fault_type = fault_type;
2233
2234 if (fault_type & VM_PROT_WRITE)
2235 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2236 else
2237 object_lock_type = OBJECT_LOCK_SHARED;
2238
2239 cur_object_lock_type = OBJECT_LOCK_SHARED;
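/*
 * The top object is locked exclusively for write faults and shared
 * otherwise; objects further down the shadow chain start out shared
 * and are upgraded only when they actually need to be modified.
 */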
2240
2241 RetryFault:
2242 /*
2243 * assume we will hit a page in the cache;
2244 * otherwise, explicitly override with
2245 * the real fault type once we determine it
2246 */
2247 type_of_fault = DBG_CACHE_HIT_FAULT;
2248
2249 /*
2250 * Find the backing store object and offset into
2251 * it to begin the search.
2252 */
2253 fault_type = original_fault_type;
2254 map = original_map;
2255 vm_map_lock_read(map);
2256
2257 kr = vm_map_lookup_locked(&map, vaddr, fault_type,
2258 object_lock_type, &version,
2259 &object, &offset, &prot, &wired,
2260 &fault_info,
2261 &real_map);
2262
2263 if (kr != KERN_SUCCESS) {
2264 vm_map_unlock_read(map);
2265 goto done;
2266 }
2267 pmap = real_map->pmap;
2268 fault_info.interruptible = interruptible;
2269
2270 /*
2271 * If the page is wired, we must fault for the current protection
2272 * value, to avoid further faults.
2273 */
2274 if (wired) {
2275 fault_type = prot | VM_PROT_WRITE;
2276
2277 /*
2278 * since we're treating this fault as a 'write'
2279 * we must hold the top object lock exclusively
2280 */
2281 if (object_lock_type == OBJECT_LOCK_SHARED) {
2282
2283 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2284
2285 if (vm_object_lock_upgrade(object) == FALSE) {
2286 /*
2287 * couldn't upgrade, so explicitly
2288 * take the lock exclusively
2289 */
2290 vm_object_lock(object);
2291 }
2292 }
2293 }
2294
2295 #if VM_FAULT_CLASSIFY
2296 /*
2297 * Temporary data gathering code
2298 */
2299 vm_fault_classify(object, offset, fault_type);
2300 #endif
2301 /*
2302 * Fast fault code. The basic idea is to do as much as
2303 * possible while holding the map lock and object locks.
2304 * Busy pages are not used until the object lock has to
2305 * be dropped to do something (copy, zero fill, pmap enter).
2306 * Similarly, paging references aren't acquired until that
2307 * point, and object references aren't used.
2308 *
2309 * If we can figure out what to do
2310 * (zero fill, copy on write, pmap enter) while holding
2311 * the locks, then it gets done. Otherwise, we give up,
2312 * and use the original fault path (which doesn't hold
2313 * the map lock, and relies on busy pages).
2314 * The give up cases include:
2315 * - Have to talk to pager.
2316 * - Page is busy, absent or in error.
2317 * - Pager has locked out desired access.
2318 * - Fault needs to be restarted.
2319 * - Have to push page into copy object.
2320 *
2321 * The code is an infinite loop that moves one level down
2322 * the shadow chain each time. cur_object and cur_offset
2323 * refer to the current object being examined. object and offset
2324 * are the original object from the map. The loop is at the
2325 * top level if and only if object and cur_object are the same.
2326 *
2327 * Invariants: Map lock is held throughout. Lock is held on
2328 * original object and cur_object (if different) when
2329 * continuing or exiting loop.
2330 *
2331 */
2332
2333
2334 /*
2335 * If this page is to be inserted in a copy delay object
2336 * for writing, and if the object has a copy, then the
2337 * copy delay strategy is handled by the slow path in vm_fault_page.
2338 */
2339 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
2340 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
2341 goto handle_copy_delay;
2342
2343 cur_object = object;
2344 cur_offset = offset;
2345
2346 while (TRUE) {
2347 m = vm_page_lookup(cur_object, cur_offset);
2348
2349 if (m != VM_PAGE_NULL) {
2350 if (m->busy) {
2351 wait_result_t result;
2352
2353 /*
2354 * in order to do the PAGE_ASSERT_WAIT, we must
2355 * have the object that 'm' belongs to locked exclusively
2356 */
2357 if (object != cur_object) {
2358 vm_object_unlock(object);
2359
2360 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2361
2362 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2363
2364 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2365 /*
2366 * couldn't upgrade so go do a full retry
2367 * immediately since we've already dropped
2368 * the top object lock associated with this page
2369 * and the current one got dropped due to the
2370 * failed upgrade... the state is no longer valid
2371 */
2372 vm_map_unlock_read(map);
2373 if (real_map != map)
2374 vm_map_unlock(real_map);
2375
2376 goto RetryFault;
2377 }
2378 }
2379 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2380
2381 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2382
2383 if (vm_object_lock_upgrade(object) == FALSE) {
2384 /*
2385 * couldn't upgrade, so explicitly take the lock
2386 * exclusively and go relookup the page since we
2387 * will have dropped the object lock and
2388 * a different thread could have inserted
2389 * a page at this offset
2390 * no need for a full retry since we're
2391 * at the top level of the object chain
2392 */
2393 vm_object_lock(object);
2394
2395 continue;
2396 }
2397 }
2398 vm_map_unlock_read(map);
2399 if (real_map != map)
2400 vm_map_unlock(real_map);
2401
2402 result = PAGE_ASSERT_WAIT(m, interruptible);
2403
2404 vm_object_unlock(cur_object);
2405
2406 if (result == THREAD_WAITING) {
2407 result = thread_block(THREAD_CONTINUE_NULL);
2408
2409 counter(c_vm_fault_page_block_busy_kernel++);
2410 }
2411 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2412 goto RetryFault;
2413
2414 kr = KERN_ABORTED;
2415 goto done;
2416 }
2417 if (m->phys_page == vm_page_guard_addr) {
2418 /*
2419 * Guard page: let the slow path deal with it
2420 */
2421 break;
2422 }
2423 if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
2424 /*
2425 * Unusual case... let the slow path deal with it
2426 */
2427 break;
2428 }
2429 if (m->encrypted) {
2430 /*
2431 * ENCRYPTED SWAP:
2432 * We've soft-faulted (because it's not in the page
2433 * table) on an encrypted page.
2434 * Keep the page "busy" so that no one messes with
2435 * it during the decryption.
2436 * Release the extra locks we're holding, keep only
2437 * the page's VM object lock.
2438 *
2439 * in order to set 'busy' on 'm', we must
2440 * have the object that 'm' belongs to locked exclusively
2441 */
2442 if (object != cur_object) {
2443 vm_object_unlock(object);
2444
2445 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2446
2447 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2448
2449 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2450 /*
2451 * couldn't upgrade so go do a full retry
2452 * immediately since we've already dropped
2453 * the top object lock associated with this page
2454 * and the current one got dropped due to the
2455 * failed upgrade... the state is no longer valid
2456 */
2457 vm_map_unlock_read(map);
2458 if (real_map != map)
2459 vm_map_unlock(real_map);
2460
2461 goto RetryFault;
2462 }
2463 }
2464 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2465
2466 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2467
2468 if (vm_object_lock_upgrade(object) == FALSE) {
2469 /*
2470 * couldn't upgrade, so explicitly take the lock
2471 * exclusively and go relookup the page since we
2472 * will have dropped the object lock and
2473 * a different thread could have inserted
2474 * a page at this offset
2475 * no need for a full retry since we're
2476 * at the top level of the object chain
2477 */
2478 vm_object_lock(object);
2479
2480 continue;
2481 }
2482 }
2483 m->busy = TRUE;
2484
2485 vm_map_unlock_read(map);
2486 if (real_map != map)
2487 vm_map_unlock(real_map);
2488
2489 vm_page_decrypt(m, 0);
2490
2491 assert(m->busy);
2492 PAGE_WAKEUP_DONE(m);
2493
2494 vm_object_unlock(cur_object);
2495 /*
2496 * Retry from the top, in case anything
2497 * changed while we were decrypting...
2498 */
2499 goto RetryFault;
2500 }
2501 ASSERT_PAGE_DECRYPTED(m);
2502
2503 if (m->object->code_signed && !m->cs_validated) {
2504 /*
2505 * We will need to validate this page
2506 * against its code signature, so we
2507 * want to hold the VM object exclusively.
2508 */
2509 if (object != cur_object) {
2510 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2511 vm_object_unlock(object);
2512 vm_object_unlock(cur_object);
2513
2514 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2515
2516 vm_map_unlock_read(map);
2517 if (real_map != map)
2518 vm_map_unlock(real_map);
2519
2520 goto RetryFault;
2521 }
2522
2523 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2524
2525 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2526
2527 if (vm_object_lock_upgrade(object) == FALSE) {
2528 /*
2529 * couldn't upgrade, so explicitly take the lock
2530 * exclusively and go relookup the page since we
2531 * will have dropped the object lock and
2532 * a different thread could have inserted
2533 * a page at this offset
2534 * no need for a full retry since we're
2535 * at the top level of the object chain
2536 */
2537 vm_object_lock(object);
2538
2539 continue;
2540 }
2541 }
2542 }
2543 /*
2544 * Two cases of map in faults:
2545 * - At top level w/o copy object.
2546 * - Read fault anywhere.
2547 * --> must disallow write.
2548 */
2549
2550 if (object == cur_object && object->copy == VM_OBJECT_NULL)
2551 goto FastPmapEnter;
2552
2553 if ((fault_type & VM_PROT_WRITE) == 0) {
2554
2555 prot &= ~VM_PROT_WRITE;
2556
2557 /*
2558 * Set up to map the page...
2559 * mark the page busy, drop
2560 * unneeded object lock
2561 */
2562 if (object != cur_object) {
2563 /*
2564 * don't need the original object anymore
2565 */
2566 vm_object_unlock(object);
2567
2568 /*
2569 * switch to the object that has the new page
2570 */
2571 object = cur_object;
2572 object_lock_type = cur_object_lock_type;
2573 }
2574 FastPmapEnter:
2575 /*
2576 * prepare for the pmap_enter...
2577 * object and map are both locked
2578 * m contains valid data
2579 * object == m->object
2580 * cur_object == NULL or it's been unlocked
2581 * no paging references on either object or cur_object
2582 */
2583 #if MACH_KDB
2584 if (db_watchpoint_list && (fault_type & VM_PROT_WRITE) == 0)
2585 prot &= ~VM_PROT_WRITE;
2586 #endif
2587 if (caller_pmap) {
2588 kr = vm_fault_enter(m,
2589 caller_pmap,
2590 caller_pmap_addr,
2591 prot,
2592 wired,
2593 change_wiring,
2594 fault_info.no_cache,
2595 &type_of_fault);
2596 } else {
2597 kr = vm_fault_enter(m,
2598 pmap,
2599 vaddr,
2600 prot,
2601 wired,
2602 change_wiring,
2603 fault_info.no_cache,
2604 &type_of_fault);
2605 }
2606
2607 if (need_collapse == TRUE)
2608 vm_object_collapse(object, offset, TRUE);
2609
2610 if (type_of_fault == DBG_PAGEIN_FAULT) {
2611 /*
2612 * evaluate access pattern and update state
2613 * vm_fault_deactivate_behind depends on the
2614 * state being up to date
2615 */
2616 vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
2617
2618 vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
2619 }
2620 /*
2621 * That's it, clean up and return.
2622 */
2623 if (m->busy)
2624 PAGE_WAKEUP_DONE(m);
2625
2626 vm_object_unlock(object);
2627
2628 vm_map_unlock_read(map);
2629 if (real_map != map)
2630 vm_map_unlock(real_map);
2631
2632 goto done;
2633 }
2634 /*
2635 * COPY ON WRITE FAULT
2636 *
2637 * If objects match, then
2638 * object->copy must not be NULL (else control
2639 * would be in the previous code block), and we
2640 * have a potential push into the copy object
2641 * with which we can't cope here.
2642 */
2643 if (cur_object == object) {
2644 /*
2645 * must take the slow path to
2646 * deal with the copy push
2647 */
2648 break;
2649 }
2650 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
2651
2652 /*
2653 * This is now a shadow based copy on write
2654 * fault -- it requires a copy up the shadow
2655 * chain.
2656 *
2657 * Allocate a page in the original top level
2658 * object. Give up if allocate fails. Also
2659 * need to remember current page, as it's the
2660 * source of the copy.
2661 *
2662 * at this point we hold locks on both
2663 * object and cur_object... no need to take
2664 * paging refs or mark pages BUSY since
2665 * we don't drop either object lock until
2666 * the page has been copied and inserted
2667 */
2668 cur_m = m;
2669 m = vm_page_grab();
2670
2671 if (m == VM_PAGE_NULL) {
2672 /*
2673 * no free page currently available...
2674 * must take the slow path
2675 */
2676 break;
2677 }
2678 /*
2679 * Now do the copy. Mark the source page busy...
2680 *
2681 * NOTE: This code holds the map lock across
2682 * the page copy.
2683 */
2684 vm_page_copy(cur_m, m);
2685 vm_page_insert(m, object, offset);
2686 m->dirty = TRUE;
2687
2688 /*
2689 * Now cope with the source page and object
2690 */
2691 if (object->ref_count > 1 && cur_m->pmapped)
2692 pmap_disconnect(cur_m->phys_page);
2693
2694 need_collapse = TRUE;
2695
2696 if (!cur_object->internal &&
2697 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
2698 /*
2699 * The object from which we've just
2700 * copied a page is most probably backed
2701 * by a vnode. We don't want to waste too
2702 * much time trying to collapse the VM objects
2703 * and create a bottleneck when several tasks
2704 * map the same file.
2705 */
2706 if (cur_object->copy == object) {
2707 /*
2708 * Shared mapping or no COW yet.
2709 * We can never collapse a copy
2710 * object into its backing object.
2711 */
2712 need_collapse = FALSE;
2713 } else if (cur_object->copy == object->shadow &&
2714 object->shadow->resident_page_count == 0) {
2715 /*
2716 * Shared mapping after a COW occurred.
2717 */
2718 need_collapse = FALSE;
2719 }
2720 }
2721 vm_object_unlock(cur_object);
2722
2723 if (need_collapse == FALSE)
2724 vm_fault_collapse_skipped++;
2725 vm_fault_collapse_total++;
2726
2727 type_of_fault = DBG_COW_FAULT;
2728 VM_STAT_INCR(cow_faults);
2729 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2730 current_task()->cow_faults++;
2731
2732 goto FastPmapEnter;
2733
2734 } else {
2735 /*
2736 * No page at cur_object, cur_offset... m == NULL
2737 */
2738 if (cur_object->pager_created) {
2739 if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) {
2740 /*
2741 * May have to talk to a pager...
2742 * take the slow path.
2743 */
2744 break;
2745 }
2746 /*
2747 * existence map present and indicates
2748 * that the pager doesn't have this page
2749 */
2750 }
2751 if (cur_object->shadow == VM_OBJECT_NULL) {
2752 /*
2753 * Zero fill fault. Page gets
2754 * inserted into the original object.
2755 */
2756 if (cur_object->shadow_severed) {
2757
2758 if (object != cur_object)
2759 vm_object_unlock(cur_object);
2760 vm_object_unlock(object);
2761
2762 vm_map_unlock_read(map);
2763 if (real_map != map)
2764 vm_map_unlock(real_map);
2765
2766 kr = KERN_MEMORY_ERROR;
2767 goto done;
2768 }
2769 if (VM_PAGE_ZFILL_THROTTLED()) {
2770 /*
2771 * drop all of our locks...
2772 * wait until the free queue is
2773 * pumped back up and then
2774 * redrive the fault
2775 */
2776 if (object != cur_object)
2777 vm_object_unlock(cur_object);
2778 vm_object_unlock(object);
2779 vm_map_unlock_read(map);
2780 if (real_map != map)
2781 vm_map_unlock(real_map);
2782
2783 if (vm_page_wait((change_wiring) ?
2784 THREAD_UNINT :
2785 THREAD_ABORTSAFE))
2786 goto RetryFault;
2787
2788 kr = KERN_ABORTED;
2789 goto done;
2790 }
2791 if (vm_backing_store_low) {
2792 /*
2793 * we are protecting the system from
2794 * backing store exhaustion...
2795 * must take the slow path if we're
2796 * not privileged
2797 */
2798 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
2799 break;
2800 }
2801 if (cur_object != object) {
2802 vm_object_unlock(cur_object);
2803
2804 cur_object = object;
2805 }
2806 if (object_lock_type == OBJECT_LOCK_SHARED) {
2807
2808 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2809
2810 if (vm_object_lock_upgrade(object) == FALSE) {
2811 /*
2812 * couldn't upgrade so do a full retry on the fault
2813 * since we dropped the object lock which
2814 * could allow another thread to insert
2815 * a page at this offset
2816 */
2817 vm_map_unlock_read(map);
2818 if (real_map != map)
2819 vm_map_unlock(real_map);
2820
2821 goto RetryFault;
2822 }
2823 }
2824 m = vm_page_alloc(object, offset);
2825
2826 if (m == VM_PAGE_NULL) {
2827 /*
2828 * no free page currently available...
2829 * must take the slow path
2830 */
2831 break;
2832 }
2833
2834 /*
2835 * Now zero fill page...
2836 * the page is probably going to
2837 * be written soon, so don't bother
2838 * to clear the modified bit
2839 *
2840 * NOTE: This code holds the map
2841 * lock across the zero fill.
2842 */
2843 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
2844
2845 goto FastPmapEnter;
2846 }
2847 /*
2848 * On to the next level in the shadow chain
2849 */
2850 cur_offset += cur_object->shadow_offset;
2851 new_object = cur_object->shadow;
2852
2853 /*
2854 * take the new_object's lock with the indicated state
2855 */
2856 if (cur_object_lock_type == OBJECT_LOCK_SHARED)
2857 vm_object_lock_shared(new_object);
2858 else
2859 vm_object_lock(new_object);
2860
2861 if (cur_object != object)
2862 vm_object_unlock(cur_object);
2863
2864 cur_object = new_object;
2865
2866 continue;
2867 }
2868 }
2869 /*
2870 * Cleanup from fast fault failure. Drop any object
2871 * lock other than original and drop map lock.
2872 */
2873 if (object != cur_object)
2874 vm_object_unlock(cur_object);
2875
2876 /*
2877 * must own the object lock exclusively at this point
2878 */
2879 if (object_lock_type == OBJECT_LOCK_SHARED) {
2880 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2881
2882 if (vm_object_lock_upgrade(object) == FALSE) {
2883 /*
2884 * couldn't upgrade, so explicitly
2885 * take the lock exclusively
2886 * no need to retry the fault at this
2887 * point since "vm_fault_page" will
2888 * completely re-evaluate the state
2889 */
2890 vm_object_lock(object);
2891 }
2892 }
2893
2894 handle_copy_delay:
2895 vm_map_unlock_read(map);
2896 if (real_map != map)
2897 vm_map_unlock(real_map);
2898
2899 /*
2900 * Make a reference to this object to
2901 * prevent its disposal while we are messing with
2902 * it. Once we have the reference, the map is free
2903 * to be diddled. Since objects reference their
2904 * shadows (and copies), they will stay around as well.
2905 */
2906 vm_object_reference_locked(object);
2907 vm_object_paging_begin(object);
2908
2909 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
2910
2911 error_code = 0;
2912
2913 kr = vm_fault_page(object, offset, fault_type,
2914 (change_wiring && !wired),
2915 &prot, &result_page, &top_page,
2916 &type_of_fault,
2917 &error_code, map->no_zero_fill,
2918 FALSE, &fault_info);
2919
2920 /*
2921 * if kr != VM_FAULT_SUCCESS, then the paging reference
2922 * has been dropped and the object unlocked... the ref_count
2923 * is still held
2924 *
2925 * if kr == VM_FAULT_SUCCESS, then the paging reference
2926 * is still held along with the ref_count on the original object
2927 *
2928 * if m != NULL, then the object it belongs to
2929 * is returned locked with a paging reference
2930 *
2931 * if top_page != NULL, then it's BUSY and the
2932 * object it belongs to has a paging reference
2933 * but is returned unlocked
2934 */
2935 if (kr != VM_FAULT_SUCCESS) {
2936 /*
2937 * we didn't succeed, lose the object reference immediately.
2938 */
2939 vm_object_deallocate(object);
2940
2941 /*
2942 * See why we failed, and take corrective action.
2943 */
2944 switch (kr) {
2945 case VM_FAULT_MEMORY_SHORTAGE:
2946 if (vm_page_wait((change_wiring) ?
2947 THREAD_UNINT :
2948 THREAD_ABORTSAFE))
2949 goto RetryFault;
2950 /*
2951 * fall thru
2952 */
2953 case VM_FAULT_INTERRUPTED:
2954 kr = KERN_ABORTED;
2955 goto done;
2956 case VM_FAULT_RETRY:
2957 goto RetryFault;
2958 case VM_FAULT_MEMORY_ERROR:
2959 if (error_code)
2960 kr = error_code;
2961 else
2962 kr = KERN_MEMORY_ERROR;
2963 goto done;
2964 }
2965 }
2966 m = result_page;
2967
2968 if (m != VM_PAGE_NULL) {
2969 assert((change_wiring && !wired) ?
2970 (top_page == VM_PAGE_NULL) :
2971 ((top_page == VM_PAGE_NULL) == (m->object == object)));
2972 }
2973
2974 /*
2975 * What to do with the resulting page from vm_fault_page
2976 * if it doesn't get entered into the physical map:
2977 */
2978 #define RELEASE_PAGE(m) \
2979 MACRO_BEGIN \
2980 PAGE_WAKEUP_DONE(m); \
2981 vm_page_lockspin_queues(); \
2982 if (!m->active && !m->inactive && !m->throttled)\
2983 vm_page_activate(m); \
2984 vm_page_unlock_queues(); \
2985 MACRO_END
2986
2987 /*
2988 * We must verify that the maps have not changed
2989 * since our last lookup.
2990 */
2991 if (m != VM_PAGE_NULL) {
2992 old_copy_object = m->object->copy;
2993 vm_object_unlock(m->object);
2994 } else
2995 old_copy_object = VM_OBJECT_NULL;
2996
2997 /*
2998 * no object locks are held at this point
2999 */
3000 if ((map != original_map) || !vm_map_verify(map, &version)) {
3001 vm_object_t retry_object;
3002 vm_object_offset_t retry_offset;
3003 vm_prot_t retry_prot;
3004
3005 /*
3006 * To avoid trying to write_lock the map while another
3007 * thread has it read_locked (in vm_map_pageable), we
3008 * do not try for write permission. If the page is
3009 * still writable, we will get write permission. If it
3010 * is not, or has been marked needs_copy, we enter the
3011 * mapping without write permission, and will merely
3012 * take another fault.
3013 */
3014 map = original_map;
3015 vm_map_lock_read(map);
3016
3017 kr = vm_map_lookup_locked(&map, vaddr,
3018 fault_type & ~VM_PROT_WRITE,
3019 OBJECT_LOCK_EXCLUSIVE, &version,
3020 &retry_object, &retry_offset, &retry_prot,
3021 &wired,
3022 &fault_info,
3023 &real_map);
3024 pmap = real_map->pmap;
3025
3026 if (kr != KERN_SUCCESS) {
3027 vm_map_unlock_read(map);
3028
3029 if (m != VM_PAGE_NULL) {
3030 /*
3031 * retake the lock so that
3032 * we can drop the paging reference
3033 * in vm_fault_cleanup and do the
3034 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3035 */
3036 vm_object_lock(m->object);
3037
3038 RELEASE_PAGE(m);
3039
3040 vm_fault_cleanup(m->object, top_page);
3041 } else {
3042 /*
3043 * retake the lock so that
3044 * we can drop the paging reference
3045 * in vm_fault_cleanup
3046 */
3047 vm_object_lock(object);
3048
3049 vm_fault_cleanup(object, top_page);
3050 }
3051 vm_object_deallocate(object);
3052
3053 goto done;
3054 }
3055 vm_object_unlock(retry_object);
3056
3057 if ((retry_object != object) || (retry_offset != offset)) {
3058
3059 vm_map_unlock_read(map);
3060 if (real_map != map)
3061 vm_map_unlock(real_map);
3062
3063 if (m != VM_PAGE_NULL) {
3064 /*
3065 * retake the lock so that
3066 * we can drop the paging reference
3067 * in vm_fault_cleanup and do the
3068 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3069 */
3070 vm_object_lock(m->object);
3071
3072 RELEASE_PAGE(m);
3073
3074 vm_fault_cleanup(m->object, top_page);
3075 } else {
3076 /*
3077 * retake the lock so that
3078 * we can drop the paging reference
3079 * in vm_fault_cleanup
3080 */
3081 vm_object_lock(object);
3082
3083 vm_fault_cleanup(object, top_page);
3084 }
3085 vm_object_deallocate(object);
3086
3087 goto RetryFault;
3088 }
3089 /*
3090 * Check whether the protection has changed or the object
3091 * has been copied while we left the map unlocked.
3092 */
3093 prot &= retry_prot;
3094 }
3095 if (m != VM_PAGE_NULL) {
3096 vm_object_lock(m->object);
3097
3098 if (m->object->copy != old_copy_object) {
3099 /*
3100 * The copy object changed while the top-level object
3101 * was unlocked, so take away write permission.
3102 */
3103 prot &= ~VM_PROT_WRITE;
3104 }
3105 } else
3106 vm_object_lock(object);
3107
3108 /*
3109 * If we want to wire down this page, but no longer have
3110 * adequate permissions, we must start all over.
3111 */
3112 if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
3113
3114 vm_map_verify_done(map, &version);
3115 if (real_map != map)
3116 vm_map_unlock(real_map);
3117
3118 if (m != VM_PAGE_NULL) {
3119 RELEASE_PAGE(m);
3120
3121 vm_fault_cleanup(m->object, top_page);
3122 } else
3123 vm_fault_cleanup(object, top_page);
3124
3125 vm_object_deallocate(object);
3126
3127 goto RetryFault;
3128 }
3129 if (m != VM_PAGE_NULL) {
3130 /*
3131 * Put this page into the physical map.
3132 * We had to do the unlock above because pmap_enter
3133 * may cause other faults. The page may be on
3134 * the pageout queues. If the pageout daemon comes
3135 * across the page, it will remove it from the queues.
3136 */
3137 if (caller_pmap) {
3138 kr = vm_fault_enter(m,
3139 caller_pmap,
3140 caller_pmap_addr,
3141 prot,
3142 wired,
3143 change_wiring,
3144 fault_info.no_cache,
3145 &type_of_fault);
3146 } else {
3147 kr = vm_fault_enter(m,
3148 pmap,
3149 vaddr,
3150 prot,
3151 wired,
3152 change_wiring,
3153 fault_info.no_cache,
3154 &type_of_fault);
3155 }
3156 if (kr != KERN_SUCCESS) {
3157 /* abort this page fault */
3158 vm_map_verify_done(map, &version);
3159 if (real_map != map)
3160 vm_map_unlock(real_map);
3161 PAGE_WAKEUP_DONE(m);
3162 vm_fault_cleanup(m->object, top_page);
3163 vm_object_deallocate(object);
3164 goto done;
3165 }
3166 } else {
3167
3168 vm_map_entry_t entry;
3169 vm_map_offset_t laddr;
3170 vm_map_offset_t ldelta, hdelta;
3171
3172 /*
3173 * do a pmap block mapping from the physical address
3174 * in the object
3175 */
3176
3177 #ifdef ppc
3178 /* While we do not worry about execution protection in */
3179 /* general, certain pages may have instruction execution */
3180 /* disallowed. We will check here, and if not allowed */
3181 /* to execute, we return with a protection failure. */
3182
3183 if ((fault_type & VM_PROT_EXECUTE) &&
3184 (!pmap_eligible_for_execute((ppnum_t)(object->shadow_offset >> 12)))) {
3185
3186 vm_map_verify_done(map, &version);
3187
3188 if (real_map != map)
3189 vm_map_unlock(real_map);
3190
3191 vm_fault_cleanup(object, top_page);
3192 vm_object_deallocate(object);
3193
3194 kr = KERN_PROTECTION_FAILURE;
3195 goto done;
3196 }
3197 #endif /* ppc */
3198
3199 if (real_map != map)
3200 vm_map_unlock(real_map);
3201
3202 if (original_map != map) {
3203 vm_map_unlock_read(map);
3204 vm_map_lock_read(original_map);
3205 map = original_map;
3206 }
3207 real_map = map;
3208
3209 laddr = vaddr;
3210 hdelta = 0xFFFFF000;
3211 ldelta = 0xFFFFF000;
3212
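/*
 * The loop below walks down through any submaps covering the faulting
 * address, clipping ldelta and hdelta (which start out as very large
 * page-aligned bounds) to the distance from the address to the start
 * and end of each enclosing map entry.
 */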
3213 while (vm_map_lookup_entry(map, laddr, &entry)) {
3214 if (ldelta > (laddr - entry->vme_start))
3215 ldelta = laddr - entry->vme_start;
3216 if (hdelta > (entry->vme_end - laddr))
3217 hdelta = entry->vme_end - laddr;
3218 if (entry->is_sub_map) {
3219
3220 laddr = (laddr - entry->vme_start)
3221 + entry->offset;
3222 vm_map_lock_read(entry->object.sub_map);
3223
3224 if (map != real_map)
3225 vm_map_unlock_read(map);
3226 if (entry->use_pmap) {
3227 vm_map_unlock_read(real_map);
3228 real_map = entry->object.sub_map;
3229 }
3230 map = entry->object.sub_map;
3231
3232 } else {
3233 break;
3234 }
3235 }
3236
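/*
 * If the bottom-level entry maps the object we faulted on, enter the
 * whole physically contiguous run [vaddr - ldelta, vaddr + hdelta) with
 * a single pmap_map_block() call; the >> 12 shifts convert byte
 * offsets into 4K page numbers.
 */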
3237 if (vm_map_lookup_entry(map, laddr, &entry) &&
3238 (entry->object.vm_object != NULL) &&
3239 (entry->object.vm_object == object)) {
3240
3241 if (caller_pmap) {
3242 /*
3243 * Set up a block mapped area
3244 */
3245 pmap_map_block(caller_pmap,
3246 (addr64_t)(caller_pmap_addr - ldelta),
3247 (((vm_map_offset_t) (entry->object.vm_object->shadow_offset)) +
3248 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12,
3249 ((ldelta + hdelta) >> 12), prot,
3250 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3251 } else {
3252 /*
3253 * Set up a block mapped area
3254 */
3255 pmap_map_block(real_map->pmap,
3256 (addr64_t)(vaddr - ldelta),
3257 (((vm_map_offset_t)(entry->object.vm_object->shadow_offset)) +
3258 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12,
3259 ((ldelta + hdelta) >> 12), prot,
3260 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3261 }
3262 }
3263 }
3264
3265 /*
3266 * Unlock everything, and return
3267 */
3268 vm_map_verify_done(map, &version);
3269 if (real_map != map)
3270 vm_map_unlock(real_map);
3271
3272 if (m != VM_PAGE_NULL) {
3273 PAGE_WAKEUP_DONE(m);
3274
3275 vm_fault_cleanup(m->object, top_page);
3276 } else
3277 vm_fault_cleanup(object, top_page);
3278
3279 vm_object_deallocate(object);
3280
3281 #undef RELEASE_PAGE
3282
3283 kr = KERN_SUCCESS;
3284 done:
3285 thread_interrupt_level(interruptible_state);
3286
3287 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3288 (int)((uint64_t)vaddr >> 32),
3289 (int)vaddr,
3290 kr,
3291 type_of_fault,
3292 0);
3293
3294 return (kr);
3295 }
3296
3297 /*
3298 * vm_fault_wire:
3299 *
3300 * Wire down a range of virtual addresses in a map.
3301 */
3302 kern_return_t
3303 vm_fault_wire(
3304 vm_map_t map,
3305 vm_map_entry_t entry,
3306 pmap_t pmap,
3307 vm_map_offset_t pmap_addr)
3308 {
3309
3310 register vm_map_offset_t va;
3311 register vm_map_offset_t end_addr = entry->vme_end;
3312 register kern_return_t rc;
3313
3314 assert(entry->in_transition);
3315
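/*
 * Physically contiguous objects are wired by their nature, so there
 * is nothing to do here (vm_fault_unwire makes the matching check).
 */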
3316 if ((entry->object.vm_object != NULL) &&
3317 !entry->is_sub_map &&
3318 entry->object.vm_object->phys_contiguous) {
3319 return KERN_SUCCESS;
3320 }
3321
3322 /*
3323 * Inform the physical mapping system that the
3324 * range of addresses may not fault, so that
3325 * page tables and such can be locked down as well.
3326 */
3327
3328 pmap_pageable(pmap, pmap_addr,
3329 pmap_addr + (end_addr - entry->vme_start), FALSE);
3330
3331 /*
3332 * We simulate a fault to get the page and enter it
3333 * in the physical map.
3334 */
3335
3336 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3337 if ((rc = vm_fault_wire_fast(
3338 map, va, entry, pmap,
3339 pmap_addr + (va - entry->vme_start)
3340 )) != KERN_SUCCESS) {
3341 rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
3342 (pmap == kernel_pmap) ?
3343 THREAD_UNINT : THREAD_ABORTSAFE,
3344 pmap, pmap_addr + (va - entry->vme_start));
3345 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
3346 }
3347
3348 if (rc != KERN_SUCCESS) {
3349 struct vm_map_entry tmp_entry = *entry;
3350
3351 /* unwire wired pages */
3352 tmp_entry.vme_end = va;
3353 vm_fault_unwire(map,
3354 &tmp_entry, FALSE, pmap, pmap_addr);
3355
3356 return rc;
3357 }
3358 }
3359 return KERN_SUCCESS;
3360 }
3361
3362 /*
3363 * vm_fault_unwire:
3364 *
3365 * Unwire a range of virtual addresses in a map.
3366 */
3367 void
3368 vm_fault_unwire(
3369 vm_map_t map,
3370 vm_map_entry_t entry,
3371 boolean_t deallocate,
3372 pmap_t pmap,
3373 vm_map_offset_t pmap_addr)
3374 {
3375 register vm_map_offset_t va;
3376 register vm_map_offset_t end_addr = entry->vme_end;
3377 vm_object_t object;
3378 struct vm_object_fault_info fault_info;
3379
3380 object = (entry->is_sub_map)
3381 ? VM_OBJECT_NULL : entry->object.vm_object;
3382
3383 /*
3384 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
3385 * do anything since such memory is wired by default. So we don't have
3386 * anything to undo here.
3387 */
3388
3389 if (object != VM_OBJECT_NULL && object->phys_contiguous)
3390 return;
3391
3392 fault_info.interruptible = THREAD_UNINT;
3393 fault_info.behavior = entry->behavior;
3394 fault_info.user_tag = entry->alias;
3395 fault_info.lo_offset = entry->offset;
3396 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
3397 fault_info.no_cache = entry->no_cache;
3398
3399 /*
3400 * Since the pages are wired down, we must be able to
3401 * get their mappings from the physical map system.
3402 */
3403
3404 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3405
3406 if (pmap) {
3407 pmap_change_wiring(pmap,
3408 pmap_addr + (va - entry->vme_start), FALSE);
3409 }
3410 if (object == VM_OBJECT_NULL) {
3411 (void) vm_fault(map, va, VM_PROT_NONE,
3412 TRUE, THREAD_UNINT, pmap, pmap_addr);
3413 } else {
3414 vm_prot_t prot;
3415 vm_page_t result_page;
3416 vm_page_t top_page;
3417 vm_object_t result_object;
3418 vm_fault_return_t result;
3419
3420 fault_info.cluster_size = end_addr - va;
3421
3422 do {
3423 prot = VM_PROT_NONE;
3424
3425 vm_object_lock(object);
3426 vm_object_paging_begin(object);
3427 XPR(XPR_VM_FAULT,
3428 "vm_fault_unwire -> vm_fault_page\n",
3429 0,0,0,0,0);
3430 result = vm_fault_page(
3431 object,
3432 entry->offset + (va - entry->vme_start),
3433 VM_PROT_NONE, TRUE,
3434 &prot, &result_page, &top_page,
3435 (int *)0,
3436 NULL, map->no_zero_fill,
3437 FALSE, &fault_info);
3438 } while (result == VM_FAULT_RETRY);
3439
3440 /*
3441 * If this was a mapping to a file on a device that has been forcibly
3442 * unmounted, then we won't get a page back from vm_fault_page(). Just
3443 * move on to the next one in case the remaining pages are mapped from
3444 * different objects. During a forced unmount, the object is terminated
3445 * so the alive flag will be false if this happens. A forced unmount
3446 * will occur when an external disk is unplugged before the user does an
3447 * eject, so we don't want to panic in that situation.
3448 */
3449
3450 if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
3451 continue;
3452
3453 if (result != VM_FAULT_SUCCESS)
3454 panic("vm_fault_unwire: failure");
3455
3456 result_object = result_page->object;
3457
3458 if (deallocate) {
3459 assert(result_page->phys_page !=
3460 vm_page_fictitious_addr);
3461 pmap_disconnect(result_page->phys_page);
3462 VM_PAGE_FREE(result_page);
3463 } else {
3464 vm_page_lockspin_queues();
3465 vm_page_unwire(result_page);
3466 vm_page_unlock_queues();
3467 PAGE_WAKEUP_DONE(result_page);
3468 }
3469 vm_fault_cleanup(result_object, top_page);
3470 }
3471 }
3472
3473 /*
3474 * Inform the physical mapping system that the range
3475 * of addresses may fault, so that page tables and
3476 * such may be unwired themselves.
3477 */
3478
3479 pmap_pageable(pmap, pmap_addr,
3480 pmap_addr + (end_addr - entry->vme_start), TRUE);
3481
3482 }
3483
3484 /*
3485 * vm_fault_wire_fast:
3486 *
3487 * Handle common case of a wire down page fault at the given address.
3488 * If successful, the page is inserted into the associated physical map.
3489 * The map entry is passed in to avoid the overhead of a map lookup.
3490 *
3491 * NOTE: the given address should be truncated to the
3492 * proper page address.
3493 *
3494 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
3495 * a standard error specifying why the fault is fatal is returned.
3496 *
3497 * The map in question must be referenced, and remains so.
3498 * Caller has a read lock on the map.
3499 *
3500 * This is a stripped version of vm_fault() for wiring pages. Anything
3501 * other than the common case will return KERN_FAILURE, and the caller
3502 * is expected to call vm_fault().
3503 */
3504 kern_return_t
3505 vm_fault_wire_fast(
3506 __unused vm_map_t map,
3507 vm_map_offset_t va,
3508 vm_map_entry_t entry,
3509 pmap_t pmap,
3510 vm_map_offset_t pmap_addr)
3511 {
3512 vm_object_t object;
3513 vm_object_offset_t offset;
3514 register vm_page_t m;
3515 vm_prot_t prot;
3516 thread_t thread = current_thread();
3517 int type_of_fault;
3518 kern_return_t kr;
3519
3520 VM_STAT_INCR(faults);
3521
3522 if (thread != THREAD_NULL && thread->task != TASK_NULL)
3523 thread->task->faults++;
3524
3525 /*
3526 * Recovery actions
3527 */
3528
3529 #undef RELEASE_PAGE
3530 #define RELEASE_PAGE(m) { \
3531 PAGE_WAKEUP_DONE(m); \
3532 vm_page_lockspin_queues(); \
3533 vm_page_unwire(m); \
3534 vm_page_unlock_queues(); \
3535 }
3536
3537
3538 #undef UNLOCK_THINGS
3539 #define UNLOCK_THINGS { \
3540 vm_object_paging_end(object); \
3541 vm_object_unlock(object); \
3542 }
3543
3544 #undef UNLOCK_AND_DEALLOCATE
3545 #define UNLOCK_AND_DEALLOCATE { \
3546 UNLOCK_THINGS; \
3547 vm_object_deallocate(object); \
3548 }
3549 /*
3550 * Give up and have caller do things the hard way.
3551 */
3552
3553 #define GIVE_UP { \
3554 UNLOCK_AND_DEALLOCATE; \
3555 return(KERN_FAILURE); \
3556 }
3557
3558
3559 /*
3560 * If this entry is not directly to a vm_object, bail out.
3561 */
3562 if (entry->is_sub_map)
3563 return(KERN_FAILURE);
3564
3565 /*
3566 * Find the backing store object and offset into it.
3567 */
3568
3569 object = entry->object.vm_object;
3570 offset = (va - entry->vme_start) + entry->offset;
3571 prot = entry->protection;
3572
3573 /*
3574 * Make a reference to this object to prevent its
3575 * disposal while we are messing with it.
3576 */
3577
3578 vm_object_lock(object);
3579 vm_object_reference_locked(object);
3580 vm_object_paging_begin(object);
3581
3582 /*
3583 * INVARIANTS (through entire routine):
3584 *
3585 * 1) At all times, we must either have the object
3586 * lock or a busy page in some object to prevent
3587 * some other thread from trying to bring in
3588 * the same page.
3589 *
3590 * 2) Once we have a busy page, we must remove it from
3591 * the pageout queues, so that the pageout daemon
3592 * will not grab it away.
3593 *
3594 */
3595
3596 /*
3597 * Look for page in top-level object. If it's not there or
3598 * there's something going on, give up.
3599 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
3600 * decrypt the page before wiring it down.
3601 */
3602 m = vm_page_lookup(object, offset);
3603 if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
3604 (m->unusual && ( m->error || m->restart || m->absent))) {
3605
3606 GIVE_UP;
3607 }
3608 ASSERT_PAGE_DECRYPTED(m);
3609
3610 if (m->fictitious &&
3611 m->phys_page == vm_page_guard_addr) {
3612 /*
3613 * Guard pages are fictitious pages and are never
3614 * entered into a pmap, so let's say it's been wired...
3615 */
3616 kr = KERN_SUCCESS;
3617 goto done;
3618 }
3619
3620 /*
3621 * Wire the page down now. All bail outs beyond this
3622 * point must unwire the page.
3623 */
3624
3625 vm_page_lockspin_queues();
3626 vm_page_wire(m);
3627 vm_page_unlock_queues();
3628
3629 /*
3630 * Mark page busy for other threads.
3631 */
3632 assert(!m->busy);
3633 m->busy = TRUE;
3634 assert(!m->absent);
3635
3636 /*
3637 * Give up if the page is being written and there's a copy object
3638 */
3639 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
3640 RELEASE_PAGE(m);
3641 GIVE_UP;
3642 }
3643
3644 /*
3645 * Put this page into the physical map.
3646 */
3647 type_of_fault = DBG_CACHE_HIT_FAULT;
3648 kr = vm_fault_enter(m,
3649 pmap,
3650 pmap_addr,
3651 prot,
3652 TRUE,
3653 FALSE,
3654 FALSE,
3655 &type_of_fault);
3656
3657 done:
3658 /*
3659 * Unlock everything, and return
3660 */
3661
3662 PAGE_WAKEUP_DONE(m);
3663 UNLOCK_AND_DEALLOCATE;
3664
3665 return kr;
3666
3667 }
3668
3669 /*
3670 * Routine: vm_fault_copy_cleanup
3671 * Purpose:
3672 * Release a page used by vm_fault_copy.
3673 */
3674
3675 void
3676 vm_fault_copy_cleanup(
3677 vm_page_t page,
3678 vm_page_t top_page)
3679 {
3680 vm_object_t object = page->object;
3681
3682 vm_object_lock(object);
3683 PAGE_WAKEUP_DONE(page);
3684 vm_page_lockspin_queues();
3685 if (!page->active && !page->inactive && !page->throttled)
3686 vm_page_activate(page);
3687 vm_page_unlock_queues();
3688 vm_fault_cleanup(object, top_page);
3689 }
3690
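/*
 * Routine: vm_fault_copy_dst_cleanup
 * Purpose:
 * Release a destination page wired by vm_fault_copy:
 * unwire it and drop the paging reference on its object.
 */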
3691 void
3692 vm_fault_copy_dst_cleanup(
3693 vm_page_t page)
3694 {
3695 vm_object_t object;
3696
3697 if (page != VM_PAGE_NULL) {
3698 object = page->object;
3699 vm_object_lock(object);
3700 vm_page_lockspin_queues();
3701 vm_page_unwire(page);
3702 vm_page_unlock_queues();
3703 vm_object_paging_end(object);
3704 vm_object_unlock(object);
3705 }
3706 }
3707
3708 /*
3709 * Routine: vm_fault_copy
3710 *
3711 * Purpose:
3712 * Copy pages from one virtual memory object to another --
3713 * neither the source nor destination pages need be resident.
3714 *
3715 * Before actually copying a page, the version associated with
3716 * the destination address map will be verified.
3717 *
3718 * In/out conditions:
3719 * The caller must hold a reference, but not a lock, to
3720 * each of the source and destination objects and to the
3721 * destination map.
3722 *
3723 * Results:
3724 * Returns KERN_SUCCESS if no errors were encountered in
3725 * reading or writing the data. Returns KERN_INTERRUPTED if
3726 * the operation was interrupted (only possible if the
3727 * "interruptible" argument is asserted). Other return values
3728 * indicate a permanent error in copying the data.
3729 *
3730 * The actual amount of data copied will be returned in the
3731 * "copy_size" argument. In the event that the destination map
3732 * verification failed, this amount may be less than the amount
3733 * requested.
3734 */
3735 kern_return_t
3736 vm_fault_copy(
3737 vm_object_t src_object,
3738 vm_object_offset_t src_offset,
3739 vm_map_size_t *copy_size, /* INOUT */
3740 vm_object_t dst_object,
3741 vm_object_offset_t dst_offset,
3742 vm_map_t dst_map,
3743 vm_map_version_t *dst_version,
3744 int interruptible)
3745 {
3746 vm_page_t result_page;
3747
3748 vm_page_t src_page;
3749 vm_page_t src_top_page;
3750 vm_prot_t src_prot;
3751
3752 vm_page_t dst_page;
3753 vm_page_t dst_top_page;
3754 vm_prot_t dst_prot;
3755
3756 vm_map_size_t amount_left;
3757 vm_object_t old_copy_object;
3758 kern_return_t error = 0;
3759
3760 vm_map_size_t part_size;
3761 struct vm_object_fault_info fault_info_src;
3762 struct vm_object_fault_info fault_info_dst;
3763
3764 /*
3765 * In order not to confuse the clustered pageins, align
3766 * the different offsets on a page boundary.
3767 */
3768
3769 #define RETURN(x) \
3770 MACRO_BEGIN \
3771 *copy_size -= amount_left; \
3772 MACRO_RETURN(x); \
3773 MACRO_END
3774
3775 amount_left = *copy_size;
3776
3777 fault_info_src.interruptible = interruptible;
3778 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
3779 fault_info_src.user_tag = 0;
3780 fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
3781 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
3782 fault_info_src.no_cache = FALSE;
3783
3784 fault_info_dst.interruptible = interruptible;
3785 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
3786 fault_info_dst.user_tag = 0;
3787 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
3788 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
3789 fault_info_dst.no_cache = FALSE;
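/*
 * Both fault_info structures advertise sequential behavior since the
 * copy loop below walks the source and destination ranges in ascending
 * order, one page (or partial page) at a time.
 */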
3790
3791 do { /* while (amount_left > 0) */
3792 /*
3793 * There may be a deadlock if both source and destination
3794 * pages are the same. To avoid this deadlock, the copy must
3795 * start by getting the destination page in order to apply
3796 * COW semantics if any.
3797 */
3798
3799 RetryDestinationFault: ;
3800
3801 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
3802
3803 vm_object_lock(dst_object);
3804 vm_object_paging_begin(dst_object);
3805
3806 fault_info_dst.cluster_size = amount_left;
3807
3808 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
3809 switch (vm_fault_page(dst_object,
3810 vm_object_trunc_page(dst_offset),
3811 VM_PROT_WRITE|VM_PROT_READ,
3812 FALSE,
3813 &dst_prot, &dst_page, &dst_top_page,
3814 (int *)0,
3815 &error,
3816 dst_map->no_zero_fill,
3817 FALSE, &fault_info_dst)) {
3818 case VM_FAULT_SUCCESS:
3819 break;
3820 case VM_FAULT_RETRY:
3821 goto RetryDestinationFault;
3822 case VM_FAULT_MEMORY_SHORTAGE:
3823 if (vm_page_wait(interruptible))
3824 goto RetryDestinationFault;
3825 /* fall thru */
3826 case VM_FAULT_INTERRUPTED:
3827 RETURN(MACH_SEND_INTERRUPTED);
3828 case VM_FAULT_MEMORY_ERROR:
3829 if (error)
3830 return (error);
3831 else
3832 return(KERN_MEMORY_ERROR);
3833 }
3834 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
3835
3836 old_copy_object = dst_page->object->copy;
3837
3838 /*
3839 * There exists the possibility that the source and
3840 * destination page are the same. But we can't
3841 * easily determine that now. If they are the
3842 * same, the call to vm_fault_page() for the
3843 * source page will deadlock on the busy destination. To prevent this we
3844 * wire the page so we can drop busy without having
3845 * the page daemon steal the page. We clean up the
3846 * top page but keep the paging reference on the object
3847 * holding the dest page so it doesn't go away.
3848 */
3849
3850 vm_page_lockspin_queues();
3851 vm_page_wire(dst_page);
3852 vm_page_unlock_queues();
3853 PAGE_WAKEUP_DONE(dst_page);
3854 vm_object_unlock(dst_page->object);
3855
3856 if (dst_top_page != VM_PAGE_NULL) {
3857 vm_object_lock(dst_object);
3858 VM_PAGE_FREE(dst_top_page);
3859 vm_object_paging_end(dst_object);
3860 vm_object_unlock(dst_object);
3861 }
3862
3863 RetrySourceFault: ;
3864
3865 if (src_object == VM_OBJECT_NULL) {
3866 /*
3867 * No source object. We will just
3868 * zero-fill the page in dst_object.
3869 */
3870 src_page = VM_PAGE_NULL;
3871 result_page = VM_PAGE_NULL;
3872 } else {
3873 vm_object_lock(src_object);
3874 src_page = vm_page_lookup(src_object,
3875 vm_object_trunc_page(src_offset));
3876 if (src_page == dst_page) {
3877 src_prot = dst_prot;
3878 result_page = VM_PAGE_NULL;
3879 } else {
3880 src_prot = VM_PROT_READ;
3881 vm_object_paging_begin(src_object);
3882
3883 fault_info_src.cluster_size = amount_left;
3884
3885 XPR(XPR_VM_FAULT,
3886 "vm_fault_copy(2) -> vm_fault_page\n",
3887 0,0,0,0,0);
3888 switch (vm_fault_page(
3889 src_object,
3890 vm_object_trunc_page(src_offset),
3891 VM_PROT_READ, FALSE,
3892 &src_prot,
3893 &result_page, &src_top_page,
3894 (int *)0, &error, FALSE,
3895 FALSE, &fault_info_src)) {
3896
3897 case VM_FAULT_SUCCESS:
3898 break;
3899 case VM_FAULT_RETRY:
3900 goto RetrySourceFault;
3901 case VM_FAULT_MEMORY_SHORTAGE:
3902 if (vm_page_wait(interruptible))
3903 goto RetrySourceFault;
3904 /* fall thru */
3905 case VM_FAULT_INTERRUPTED:
3906 vm_fault_copy_dst_cleanup(dst_page);
3907 RETURN(MACH_SEND_INTERRUPTED);
3908 case VM_FAULT_MEMORY_ERROR:
3909 vm_fault_copy_dst_cleanup(dst_page);
3910 if (error)
3911 return (error);
3912 else
3913 return(KERN_MEMORY_ERROR);
3914 }
3915
3916
3917 assert((src_top_page == VM_PAGE_NULL) ==
3918 (result_page->object == src_object));
3919 }
3920 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
3921 vm_object_unlock(result_page->object);
3922 }
3923
3924 if (!vm_map_verify(dst_map, dst_version)) {
3925 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3926 vm_fault_copy_cleanup(result_page, src_top_page);
3927 vm_fault_copy_dst_cleanup(dst_page);
3928 break;
3929 }
3930
3931 vm_object_lock(dst_page->object);
3932
3933 if (dst_page->object->copy != old_copy_object) {
3934 vm_object_unlock(dst_page->object);
3935 vm_map_verify_done(dst_map, dst_version);
3936 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3937 vm_fault_copy_cleanup(result_page, src_top_page);
3938 vm_fault_copy_dst_cleanup(dst_page);
3939 break;
3940 }
3941 vm_object_unlock(dst_page->object);
3942
3943 /*
3944 * Copy the page, and note that it is dirty
3945 * immediately.
3946 */
3947
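/*
 * If either offset or the remaining length is not page aligned, copy
 * (or zero) only the part of the page that both the source and
 * destination offsets can reach, capped at the amount left to copy;
 * otherwise move a whole page.
 */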
3948 if (!page_aligned(src_offset) ||
3949 !page_aligned(dst_offset) ||
3950 !page_aligned(amount_left)) {
3951
3952 vm_object_offset_t src_po,
3953 dst_po;
3954
3955 src_po = src_offset - vm_object_trunc_page(src_offset);
3956 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
3957
3958 if (dst_po > src_po) {
3959 part_size = PAGE_SIZE - dst_po;
3960 } else {
3961 part_size = PAGE_SIZE - src_po;
3962 }
3963 if (part_size > (amount_left)){
3964 part_size = amount_left;
3965 }
3966
3967 if (result_page == VM_PAGE_NULL) {
3968 vm_page_part_zero_fill(dst_page,
3969 dst_po, part_size);
3970 } else {
3971 vm_page_part_copy(result_page, src_po,
3972 dst_page, dst_po, part_size);
3973 if(!dst_page->dirty){
3974 vm_object_lock(dst_object);
3975 dst_page->dirty = TRUE;
3976 vm_object_unlock(dst_page->object);
3977 }
3978
3979 }
3980 } else {
3981 part_size = PAGE_SIZE;
3982
3983 if (result_page == VM_PAGE_NULL)
3984 vm_page_zero_fill(dst_page);
3985 else{
3986 vm_page_copy(result_page, dst_page);
3987 if(!dst_page->dirty){
3988 vm_object_lock(dst_object);
3989 dst_page->dirty = TRUE;
3990 vm_object_unlock(dst_page->object);
3991 }
3992 }
3993
3994 }
3995
3996 /*
3997 * Unlock everything, and return
3998 */
3999
4000 vm_map_verify_done(dst_map, dst_version);
4001
4002 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4003 vm_fault_copy_cleanup(result_page, src_top_page);
4004 vm_fault_copy_dst_cleanup(dst_page);
4005
4006 amount_left -= part_size;
4007 src_offset += part_size;
4008 dst_offset += part_size;
4009 } while (amount_left > 0);
4010
4011 RETURN(KERN_SUCCESS);
4012 #undef RETURN
4013
4014 /*NOTREACHED*/
4015 }
4016
4017 #if VM_FAULT_CLASSIFY
4018 /*
4019 * Temporary statistics gathering support.
4020 */
4021
4022 /*
4023 * Statistics arrays:
4024 */
4025 #define VM_FAULT_TYPES_MAX 5
4026 #define VM_FAULT_LEVEL_MAX 8
4027
4028 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4029
4030 #define VM_FAULT_TYPE_ZERO_FILL 0
4031 #define VM_FAULT_TYPE_MAP_IN 1
4032 #define VM_FAULT_TYPE_PAGER 2
4033 #define VM_FAULT_TYPE_COPY 3
4034 #define VM_FAULT_TYPE_OTHER 4
4035
4036
4037 void
4038 vm_fault_classify(vm_object_t object,
4039 vm_object_offset_t offset,
4040 vm_prot_t fault_type)
4041 {
4042 int type, level = 0;
4043 vm_page_t m;
4044
4045 while (TRUE) {
4046 m = vm_page_lookup(object, offset);
4047 if (m != VM_PAGE_NULL) {
4048 if (m->busy || m->error || m->restart || m->absent) {
4049 type = VM_FAULT_TYPE_OTHER;
4050 break;
4051 }
4052 if (((fault_type & VM_PROT_WRITE) == 0) ||
4053 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4054 type = VM_FAULT_TYPE_MAP_IN;
4055 break;
4056 }
4057 type = VM_FAULT_TYPE_COPY;
4058 break;
4059 }
4060 else {
4061 if (object->pager_created) {
4062 type = VM_FAULT_TYPE_PAGER;
4063 break;
4064 }
4065 if (object->shadow == VM_OBJECT_NULL) {
4066 type = VM_FAULT_TYPE_ZERO_FILL;
4067 break;
4068 }
4069
4070 offset += object->shadow_offset;
4071 object = object->shadow;
4072 level++;
4073 continue;
4074 }
4075 }
4076
4077 if (level >= VM_FAULT_LEVEL_MAX) /* clamp: last valid index is MAX - 1 */
4078 level = VM_FAULT_LEVEL_MAX - 1;
4079
4080 vm_fault_stats[type][level] += 1;
4081
4082 return;
4083 }
4084
4085 /* cleanup routine to call from the debugger */
4086
4087 void
4088 vm_fault_classify_init(void)
4089 {
4090 int type, level;
4091
4092 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
4093 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
4094 vm_fault_stats[type][level] = 0;
4095 }
4096 }
4097
4098 return;
4099 }
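/*
 * Illustrative sketch (editorial addition, not in the original
 * source): a companion routine that could be called from the debugger
 * to report the counters gathered by vm_fault_classify().  The
 * routine name is hypothetical; the block is for exposition only.
 */
#if 0 /* example only -- not compiled */
void
vm_fault_classify_report(void)
{
	int type, level;

	for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
		for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
			if (vm_fault_stats[type][level] != 0) {
				printf("vm_fault_stats[%d][%d] = %d\n",
				       type, level,
				       vm_fault_stats[type][level]);
			}
		}
	}
}
#endif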
4100 #endif /* VM_FAULT_CLASSIFY */
4101
4102
4103 extern int cs_validation;
4104
4105 void
4106 vm_page_validate_cs(
4107 vm_page_t page)
4108 {
4109 vm_object_t object;
4110 vm_object_offset_t offset;
4111 vm_map_offset_t koffset;
4112 vm_map_size_t ksize;
4113 vm_offset_t kaddr;
4114 kern_return_t kr;
4115 memory_object_t pager;
4116 void *blobs;
4117 boolean_t validated, tainted;
4118 boolean_t busy_page;
4119
4120 vm_object_lock_assert_exclusive(page->object);
4121 assert(!page->cs_validated);
4122
4123 if (!cs_validation) {
4124 return;
4125 }
4126
4127 object = page->object;
4128 assert(object->code_signed);
4129 offset = page->offset;
4130
4131 busy_page = page->busy;
4132 if (!busy_page) {
4133 /* keep page busy while we map (and unlock) the VM object */
4134 page->busy = TRUE;
4135 }
4136
4137 /*
4138 * Take a paging reference on the VM object
4139 * to protect it from collapse or bypass,
4140 * and keep it from disappearing too.
4141 */
4142 vm_object_paging_begin(object);
4143
4144 /* map the page in the kernel address space */
4145 koffset = 0;
4146 ksize = PAGE_SIZE_64;
4147 kr = vm_paging_map_object(&koffset,
4148 page,
4149 object,
4150 offset,
4151 &ksize,
4152 FALSE); /* can't unlock the object! */
4153 if (kr != KERN_SUCCESS) {
4154 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
4155 }
4156 kaddr = CAST_DOWN(vm_offset_t, koffset);
4157
4158 /*
4159 * Since we get here to validate a page that was brought in by
4160 * the pager, we know that this pager is all set up and ready
4161 * by now.
4162 */
4163 assert(!object->internal);
4164 assert(object->pager != NULL);
4165 assert(object->pager_ready);
4166
4167 if (!object->alive || object->terminating || object->pager == NULL) {
4168 /*
4169 * The object is terminating and we don't have its pager
4170 * so we can't validate the data...
4171 */
4172 goto out;
4173 }
4174
4175 pager = object->pager;
4176 assert(pager != NULL);
4177
4178 kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
4179 if (kr != KERN_SUCCESS) {
4180 blobs = NULL;
4181 }
4182
4183 /* verify the SHA1 hash for this page */
4184 validated = cs_validate_page(blobs,
4185 offset + object->paging_offset,
4186 (const void *)kaddr,
4187 &tainted);
4188
4189 assert(page->busy);
4190 assert(object == page->object);
4191 vm_object_lock_assert_exclusive(object);
4192
4193 page->cs_validated = validated;
4194 if (validated) {
4195 page->cs_tainted = tainted;
4196 }
4197
4198 out:
4199 if (!busy_page) {
4200 PAGE_WAKEUP_DONE(page);
4201 }
4202 if (koffset != 0) {
4203 /* unmap the page from the kernel address space */
4204 vm_paging_unmap_object(object, koffset, koffset + ksize);
4205 koffset = 0;
4206 ksize = 0;
4207 kaddr = 0;
4208 }
4209 vm_object_paging_end(object);
4210 }
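/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): the calling pattern expected by vm_page_validate_cs().
 * The caller holds the page's VM object lock exclusively, validates
 * the page only if it has not been validated yet, and then consults
 * the cs_validated/cs_tainted bits.  The helper name is hypothetical
 * and the block is for exposition only.
 */
#if 0 /* example only -- not compiled */
static void
vm_page_check_cs_sketch(
	vm_page_t	page)
{
	vm_object_lock_assert_exclusive(page->object);

	if (page->object->code_signed && !page->cs_validated) {
		vm_page_validate_cs(page);
	}
	if (page->cs_validated && page->cs_tainted) {
		/* the page's contents do not match its code signature */
		printf("CODE SIGNING: tainted page at offset 0x%llx\n",
		       page->offset);
	}
}
#endif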