[apple/xnu.git] / osfmk / vm / vm_fault.c (xnu-1228.15.4)
1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm_fault.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Page fault handling module.
63 */
64
65 #include <mach_cluster_stats.h>
66 #include <mach_pagemap.h>
67 #include <mach_kdb.h>
68 #include <libkern/OSAtomic.h>
69
70 #include <mach/mach_types.h>
71 #include <mach/kern_return.h>
72 #include <mach/message.h> /* for error codes */
73 #include <mach/vm_param.h>
74 #include <mach/vm_behavior.h>
75 #include <mach/memory_object.h>
76 /* For memory_object_data_{request,unlock} */
77 #include <mach/sdt.h>
78
79 #include <kern/kern_types.h>
80 #include <kern/host_statistics.h>
81 #include <kern/counters.h>
82 #include <kern/task.h>
83 #include <kern/thread.h>
84 #include <kern/sched_prim.h>
85 #include <kern/host.h>
86 #include <kern/xpr.h>
87 #include <kern/mach_param.h>
88 #include <kern/macro_help.h>
89 #include <kern/zalloc.h>
90 #include <kern/misc_protos.h>
91
92 #include <ppc/proc_reg.h>
93
94 #include <vm/vm_fault.h>
95 #include <vm/vm_map.h>
96 #include <vm/vm_object.h>
97 #include <vm/vm_page.h>
98 #include <vm/vm_kern.h>
99 #include <vm/pmap.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/vm_protos.h>
102 #include <vm/vm_external.h>
103 #include <vm/memory_object.h>
104 #include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */
105
106 #include <sys/kdebug.h>
107
108 #define VM_FAULT_CLASSIFY 0
109
110 /* Zero-filled pages are marked "m->zero_fill" and put on the
111 * special zero-fill inactive queue only if they belong to
112 * an object at least this big.
113 */
114 #define VM_ZF_OBJECT_SIZE_THRESHOLD (0x200000)
115
116 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
117
118 int vm_object_pagein_throttle = 16;
119
120 extern int cs_debug;
121
122 #if MACH_KDB
123 extern struct db_watchpoint *db_watchpoint_list;
124 #endif /* MACH_KDB */
125
126
127 /* Forward declarations of internal routines. */
128 extern kern_return_t vm_fault_wire_fast(
129 vm_map_t map,
130 vm_map_offset_t va,
131 vm_map_entry_t entry,
132 pmap_t pmap,
133 vm_map_offset_t pmap_addr);
134
135 extern void vm_fault_continue(void);
136
137 extern void vm_fault_copy_cleanup(
138 vm_page_t page,
139 vm_page_t top_page);
140
141 extern void vm_fault_copy_dst_cleanup(
142 vm_page_t page);
143
144 #if VM_FAULT_CLASSIFY
145 extern void vm_fault_classify(vm_object_t object,
146 vm_object_offset_t offset,
147 vm_prot_t fault_type);
148
149 extern void vm_fault_classify_init(void);
150 #endif
151
152
153 unsigned long vm_cs_validates = 0;
154 unsigned long vm_cs_revalidates = 0;
155 unsigned long vm_cs_query_modified = 0;
156 unsigned long vm_cs_validated_dirtied = 0;
157
158 #if CONFIG_ENFORCE_SIGNED_CODE
159 #if SECURE_KERNEL
160 const int cs_enforcement_disable=0;
161 #else
162 int cs_enforcement_disable=1;
163 #endif
164 #endif
165
166 /*
167 * Routine: vm_fault_init
168 * Purpose:
169 * Initialize our private data structures.
170 */
171 void
172 vm_fault_init(void)
173 {
174 #if !SECURE_KERNEL
175 #if CONFIG_ENFORCE_SIGNED_CODE
176 PE_parse_boot_argn("cs_enforcement_disable", &cs_enforcement_disable, sizeof (cs_enforcement_disable));
177 #endif
178 PE_parse_boot_argn("cs_debug", &cs_debug, sizeof (cs_debug));
179 #endif
180 }
181
182 /*
183 * Routine: vm_fault_cleanup
184 * Purpose:
185 * Clean up the result of vm_fault_page.
186 * Results:
187 * The paging reference for "object" is released.
188 * "object" is unlocked.
189 * If "top_page" is not null, "top_page" is
190 * freed and the paging reference for the object
191 * containing it is released.
192 *
193 * In/out conditions:
194 * "object" must be locked.
195 */
196 void
197 vm_fault_cleanup(
198 register vm_object_t object,
199 register vm_page_t top_page)
200 {
201 vm_object_paging_end(object);
202 vm_object_unlock(object);
203
204 if (top_page != VM_PAGE_NULL) {
205 object = top_page->object;
206
207 vm_object_lock(object);
208 VM_PAGE_FREE(top_page);
209 vm_object_paging_end(object);
210 vm_object_unlock(object);
211 }
212 }
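/*
 * Illustrative sketch (not part of the original source): how a typical
 * error path in a vm_fault_page()-style caller is expected to use
 * vm_fault_cleanup() per the contract above.  The "data cannot be
 * obtained" condition is hypothetical; only the locking and
 * paging-reference pattern is the point here.
 */
#if 0
	vm_object_lock(object);			/* caller owns the object lock... */
	vm_object_paging_begin(object);		/* ...and donates one paging reference */

	m = vm_page_lookup(object, offset);
	if (m == VM_PAGE_NULL /* and the data cannot be obtained */) {
		/*
		 * drops the paging reference, unlocks "object" and,
		 * if first_m is non-NULL, frees the busy placeholder
		 * page in the top object as well
		 */
		vm_fault_cleanup(object, first_m);
		return (VM_FAULT_MEMORY_ERROR);
	}
#endif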
213
214 #if MACH_CLUSTER_STATS
215 #define MAXCLUSTERPAGES 16
216 struct {
217 unsigned long pages_in_cluster;
218 unsigned long pages_at_higher_offsets;
219 unsigned long pages_at_lower_offsets;
220 } cluster_stats_in[MAXCLUSTERPAGES];
221 #define CLUSTER_STAT(clause) clause
222 #define CLUSTER_STAT_HIGHER(x) \
223 ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
224 #define CLUSTER_STAT_LOWER(x) \
225 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
226 #define CLUSTER_STAT_CLUSTER(x) \
227 ((cluster_stats_in[(x)].pages_in_cluster)++)
228 #else /* MACH_CLUSTER_STATS */
229 #define CLUSTER_STAT(clause)
230 #endif /* MACH_CLUSTER_STATS */
231
232 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
233
234
235 boolean_t vm_page_deactivate_behind = TRUE;
236 /*
237 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
238 */
239 int vm_default_ahead = 0;
240 int vm_default_behind = MAX_UPL_TRANSFER;
241
242 #define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024)
243
244 /*
245  * vm_fault_is_sequential
246 *
247 * Determine if sequential access is in progress
248 * in accordance with the behavior specified.
249 * Update state to indicate current access pattern.
250 *
251 * object must have at least the shared lock held
252 */
253 static
254 void
255 vm_fault_is_sequential(
256 vm_object_t object,
257 vm_object_offset_t offset,
258 vm_behavior_t behavior)
259 {
260 vm_object_offset_t last_alloc;
261 int sequential;
262 int orig_sequential;
263
264 last_alloc = object->last_alloc;
265 sequential = object->sequential;
266 orig_sequential = sequential;
267
268 switch (behavior) {
269 case VM_BEHAVIOR_RANDOM:
270 /*
271 * reset indicator of sequential behavior
272 */
273 sequential = 0;
274 break;
275
276 case VM_BEHAVIOR_SEQUENTIAL:
277 if (offset && last_alloc == offset - PAGE_SIZE_64) {
278 /*
279 * advance indicator of sequential behavior
280 */
281 if (sequential < MAX_SEQUENTIAL_RUN)
282 sequential += PAGE_SIZE;
283 } else {
284 /*
285 * reset indicator of sequential behavior
286 */
287 sequential = 0;
288 }
289 break;
290
291 case VM_BEHAVIOR_RSEQNTL:
292 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
293 /*
294 * advance indicator of sequential behavior
295 */
296 if (sequential > -MAX_SEQUENTIAL_RUN)
297 sequential -= PAGE_SIZE;
298 } else {
299 /*
300 * reset indicator of sequential behavior
301 */
302 sequential = 0;
303 }
304 break;
305
306 case VM_BEHAVIOR_DEFAULT:
307 default:
308 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
309 /*
310 * advance indicator of sequential behavior
311 */
312 if (sequential < 0)
313 sequential = 0;
314 if (sequential < MAX_SEQUENTIAL_RUN)
315 sequential += PAGE_SIZE;
316
317 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
318 /*
319 * advance indicator of sequential behavior
320 */
321 if (sequential > 0)
322 sequential = 0;
323 if (sequential > -MAX_SEQUENTIAL_RUN)
324 sequential -= PAGE_SIZE;
325 } else {
326 /*
327 * reset indicator of sequential behavior
328 */
329 sequential = 0;
330 }
331 break;
332 }
333 if (sequential != orig_sequential) {
334 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
335 /*
336 * if someone else has already updated object->sequential
337 * don't bother trying to update it or object->last_alloc
338 */
339 return;
340 }
341 }
342 /*
343  * I'd like to do this with an OSCompareAndSwap64, but that
344 * doesn't exist for PPC... however, it shouldn't matter
345 * that much... last_alloc is maintained so that we can determine
346 * if a sequential access pattern is taking place... if only
347 * one thread is banging on this object, no problem with the unprotected
348 * update... if 2 or more threads are banging away, we run the risk of
349 * someone seeing a mangled update... however, in the face of multiple
350 * accesses, no sequential access pattern can develop anyway, so we
351 * haven't lost any real info.
352 */
353 object->last_alloc = offset;
354 }
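/*
 * Illustrative sketch (not part of the original source): how
 * object->sequential evolves for a simple forward scan under
 * VM_BEHAVIOR_DEFAULT.  It assumes the object's shared lock is held by
 * the caller and that no concurrent faulters race the OSCompareAndSwap
 * above; the values noted are approximate.
 */
#if 0
	vm_object_offset_t off;

	for (off = 0; off < (vm_object_offset_t)(8 * PAGE_SIZE_64); off += PAGE_SIZE_64)
		vm_fault_is_sequential(object, off, VM_BEHAVIOR_DEFAULT);
	/*
	 * each adjacent forward fault adds PAGE_SIZE to object->sequential
	 * (capped at MAX_SEQUENTIAL_RUN) and records the offset in
	 * object->last_alloc; a VM_BEHAVIOR_RANDOM fault, or any
	 * non-adjacent offset, resets object->sequential to 0.
	 */
#endif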
355
356
357 /*
358  * vm_fault_deactivate_behind
359 *
360 * Determine if sequential access is in progress
361 * in accordance with the behavior specified. If
362 * so, compute a potential page to deactivate and
363 * deactivate it.
364 *
365 * object must be locked.
366 *
367 * return TRUE if we actually deactivate a page
368 */
369 static
370 boolean_t
371 vm_fault_deactivate_behind(
372 vm_object_t object,
373 vm_object_offset_t offset,
374 vm_behavior_t behavior)
375 {
376 vm_page_t m = NULL;
377 int sequential_run;
378 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
379
380 #if TRACEFAULTPAGE
381 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
382 #endif
383
384 if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
385 /*
386 * Do not deactivate pages from the kernel object: they
387                  * are not intended to become pageable...
388                  * or we've disabled the deactivate-behind mechanism
389 */
390 return FALSE;
391 }
392 if ((sequential_run = object->sequential)) {
393 if (sequential_run < 0) {
394 sequential_behavior = VM_BEHAVIOR_RSEQNTL;
395 sequential_run = 0 - sequential_run;
396 } else {
397 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
398 }
399 }
400 switch (behavior) {
401 case VM_BEHAVIOR_RANDOM:
402 break;
403 case VM_BEHAVIOR_SEQUENTIAL:
404 if (sequential_run >= (int)PAGE_SIZE)
405 m = vm_page_lookup(object, offset - PAGE_SIZE_64);
406 break;
407 case VM_BEHAVIOR_RSEQNTL:
408 if (sequential_run >= (int)PAGE_SIZE)
409 m = vm_page_lookup(object, offset + PAGE_SIZE_64);
410 break;
411 case VM_BEHAVIOR_DEFAULT:
412 default:
413 { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
414
415 /*
416                 * determine if the run of sequential accesses has been
417 * long enough on an object with default access behavior
418 * to consider it for deactivation
419 */
420 if ((uint64_t)sequential_run >= behind) {
421 if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
422 if (offset >= behind)
423 m = vm_page_lookup(object, offset - behind);
424 } else {
425 if (offset < -behind)
426 m = vm_page_lookup(object, offset + behind);
427 }
428 }
429 break;
430 }
431 }
432 if (m) {
433 if (!m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
434 pmap_clear_reference(m->phys_page);
435 m->deactivated = TRUE;
436 #if TRACEFAULTPAGE
437 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
438 #endif
439 return TRUE;
440 }
441 }
442 return FALSE;
443 }
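/*
 * Illustrative sketch (not part of the original source): for a forward
 * sequential run under VM_BEHAVIOR_DEFAULT, the candidate page is
 * vm_default_behind pages behind the faulting offset; reverse runs look
 * the same distance ahead.  Hedged example, object lock assumed held.
 */
#if 0
	vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;

	if (object->sequential >= (int)behind && offset >= behind) {
		/* same lookup the VM_BEHAVIOR_DEFAULT case above performs */
		vm_page_t trailing = vm_page_lookup(object, offset - behind);

		if (trailing != VM_PAGE_NULL)
			(void) vm_fault_deactivate_behind(object, offset, VM_BEHAVIOR_DEFAULT);
	}
#endif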
444
445
446 /*
447 * check for various conditions that would
448 * prevent us from creating a ZF page...
449 * cleanup is based on being called from vm_fault_page
450 *
451 * object must be locked
452 * object == m->object
453 */
454 static vm_fault_return_t
455 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
456 {
457 if (object->shadow_severed) {
458 /*
459 * the shadow chain was severed
460 * just have to return an error at this point
461 */
462 if (m != VM_PAGE_NULL)
463 VM_PAGE_FREE(m);
464 vm_fault_cleanup(object, first_m);
465
466 thread_interrupt_level(interruptible_state);
467
468 return (VM_FAULT_MEMORY_ERROR);
469 }
470 if (vm_backing_store_low) {
471 /*
472 * are we protecting the system from
473                  * backing store exhaustion? If so,
474 * sleep unless we are privileged.
475 */
476 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
477
478 if (m != VM_PAGE_NULL)
479 VM_PAGE_FREE(m);
480 vm_fault_cleanup(object, first_m);
481
482 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
483
484 thread_block(THREAD_CONTINUE_NULL);
485 thread_interrupt_level(interruptible_state);
486
487 return (VM_FAULT_RETRY);
488 }
489 }
490 if (VM_PAGE_ZFILL_THROTTLED()) {
491 /*
492 * we're throttling zero-fills...
493 * treat this as if we couldn't grab a page
494 */
495 if (m != VM_PAGE_NULL)
496 VM_PAGE_FREE(m);
497 vm_fault_cleanup(object, first_m);
498
499 thread_interrupt_level(interruptible_state);
500
501 return (VM_FAULT_MEMORY_SHORTAGE);
502 }
503 return (VM_FAULT_SUCCESS);
504 }
505
506
507 /*
508 * do the work to zero fill a page and
509 * inject it into the correct paging queue
510 *
511 * m->object must be locked
512 * page queue lock must NOT be held
513 */
514 static int
515 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
516 {
517 int my_fault = DBG_ZERO_FILL_FAULT;
518
519 /*
520          * This is a zero-fill page fault...
521 *
522 * Checking the page lock is a waste of
523 * time; this page was absent, so
524 * it can't be page locked by a pager.
525 *
526 * we also consider it undefined
527 * with respect to instruction
528 * execution. i.e. it is the responsibility
529 * of higher layers to call for an instruction
530 * sync after changing the contents and before
531 * sending a program into this area. We
532 * choose this approach for performance
533 */
534 m->pmapped = TRUE;
535
536 m->cs_validated = FALSE;
537 m->cs_tainted = FALSE;
538
539 if (no_zero_fill == TRUE)
540 my_fault = DBG_NZF_PAGE_FAULT;
541 else {
542 vm_page_zero_fill(m);
543
544 VM_STAT_INCR(zero_fill_count);
545 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
546 }
547 assert(!m->laundry);
548 assert(m->object != kernel_object);
549 //assert(m->pageq.next == NULL && m->pageq.prev == NULL);
550
551 if (!IP_VALID(memory_manager_default) &&
552 (m->object->purgable == VM_PURGABLE_DENY ||
553 m->object->purgable == VM_PURGABLE_NONVOLATILE ||
554 m->object->purgable == VM_PURGABLE_VOLATILE )) {
555 vm_page_lock_queues();
556
557 queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
558 m->throttled = TRUE;
559 vm_page_throttled_count++;
560
561 vm_page_unlock_queues();
562 } else {
563 if (m->object->size > VM_ZF_OBJECT_SIZE_THRESHOLD) {
564 m->zero_fill = TRUE;
565 OSAddAtomic(1, (SInt32 *)&vm_zf_count);
566 }
567 }
568 return (my_fault);
569 }
570
571
572 /*
573 * Routine: vm_fault_page
574 * Purpose:
575 * Find the resident page for the virtual memory
576 * specified by the given virtual memory object
577 * and offset.
578 * Additional arguments:
579  *              The required permissions for the page are given
580 * in "fault_type". Desired permissions are included
581 * in "protection".
582 * fault_info is passed along to determine pagein cluster
583 * limits... it contains the expected reference pattern,
584 * cluster size if available, etc...
585 *
586 * If the desired page is known to be resident (for
587 * example, because it was previously wired down), asserting
588 * the "unwiring" parameter will speed the search.
589 *
590 * If the operation can be interrupted (by thread_abort
591 * or thread_terminate), then the "interruptible"
592 * parameter should be asserted.
593 *
594 * Results:
595 * The page containing the proper data is returned
596 * in "result_page".
597 *
598 * In/out conditions:
599 * The source object must be locked and referenced,
600 * and must donate one paging reference. The reference
601 * is not affected. The paging reference and lock are
602 * consumed.
603 *
604 * If the call succeeds, the object in which "result_page"
605 * resides is left locked and holding a paging reference.
606 * If this is not the original object, a busy page in the
607 * original object is returned in "top_page", to prevent other
608 * callers from pursuing this same data, along with a paging
609 * reference for the original object. The "top_page" should
610 * be destroyed when this guarantee is no longer required.
611 * The "result_page" is also left busy. It is not removed
612 * from the pageout queues.
613 */
614
615 vm_fault_return_t
616 vm_fault_page(
617 /* Arguments: */
618 vm_object_t first_object, /* Object to begin search */
619 vm_object_offset_t first_offset, /* Offset into object */
620 vm_prot_t fault_type, /* What access is requested */
621 boolean_t must_be_resident,/* Must page be resident? */
622 /* Modifies in place: */
623 vm_prot_t *protection, /* Protection for mapping */
624 /* Returns: */
625 vm_page_t *result_page, /* Page found, if successful */
626 vm_page_t *top_page, /* Page in top object, if
627 * not result_page. */
628 int *type_of_fault, /* if non-null, fill in with type of fault
629 * COW, zero-fill, etc... returned in trace point */
630 /* More arguments: */
631 kern_return_t *error_code, /* code if page is in error */
632 boolean_t no_zero_fill, /* don't zero fill absent pages */
633 #if MACH_PAGEMAP
634 boolean_t data_supply, /* treat as data_supply if
635 * it is a write fault and a full
636 * page is provided */
637 #else
638 __unused boolean_t data_supply,
639 #endif
640 vm_object_fault_info_t fault_info)
641 {
642 vm_page_t m;
643 vm_object_t object;
644 vm_object_offset_t offset;
645 vm_page_t first_m;
646 vm_object_t next_object;
647 vm_object_t copy_object;
648 boolean_t look_for_page;
649 vm_prot_t access_required = fault_type;
650 vm_prot_t wants_copy_flag;
651 CLUSTER_STAT(int pages_at_higher_offsets;)
652 CLUSTER_STAT(int pages_at_lower_offsets;)
653 kern_return_t wait_result;
654 boolean_t interruptible_state;
655 vm_fault_return_t error;
656 int my_fault;
657 uint32_t try_failed_count;
658         int                     interruptible; /* how may the fault be interrupted? */
659 memory_object_t pager;
660
661 /*
662 * MACH page map - an optional optimization where a bit map is maintained
663 * by the VM subsystem for internal objects to indicate which pages of
664 * the object currently reside on backing store. This existence map
665 * duplicates information maintained by the vnode pager. It is
666 * created at the time of the first pageout against the object, i.e.
667  * at the same time the pager for the object is created. The optimization
668 * is designed to eliminate pager interaction overhead, if it is
669 * 'known' that the page does not exist on backing store.
670 *
671 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
672 * either marked as paged out in the existence map for the object or no
673 * existence map exists for the object. MUST_ASK_PAGER() is one of the
674 * criteria in the decision to invoke the pager. It is also used as one
675 * of the criteria to terminate the scan for adjacent pages in a clustered
676 * pagein operation. Note that MUST_ASK_PAGER() always evaluates to TRUE for
677 * permanent objects. Note also that if the pager for an internal object
678 * has not been created, the pager is not invoked regardless of the value
679 * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
680 * for which a pager has been created.
681 *
682 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
683  * is marked as paged out in the existence map for the object.
684  * PAGED_OUT() is used to determine if a page has already been pushed
685 * into a copy object in order to avoid a redundant page out operation.
686 */
687 #if MACH_PAGEMAP
688 #define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \
689 != VM_EXTERNAL_STATE_ABSENT)
690 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
691 == VM_EXTERNAL_STATE_EXISTS)
692 #else
693 #define MUST_ASK_PAGER(o, f) (TRUE)
694 #define PAGED_OUT(o, f) (FALSE)
695 #endif
696
697 /*
698 * Recovery actions
699 */
700 #define PREPARE_RELEASE_PAGE(m) \
701 MACRO_BEGIN \
702 vm_page_lock_queues(); \
703 MACRO_END
704
705 #define DO_RELEASE_PAGE(m) \
706 MACRO_BEGIN \
707 PAGE_WAKEUP_DONE(m); \
708 if (!m->active && !m->inactive && !m->throttled)\
709 vm_page_activate(m); \
710 vm_page_unlock_queues(); \
711 MACRO_END
712
713 #define RELEASE_PAGE(m) \
714 MACRO_BEGIN \
715 PREPARE_RELEASE_PAGE(m); \
716 DO_RELEASE_PAGE(m); \
717 MACRO_END
718
719 #if TRACEFAULTPAGE
720 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
721 #endif
722
723
724 #if MACH_KDB
725 /*
726 * If there are watchpoints set, then
727 * we don't want to give away write permission
728 * on a read fault. Make the task write fault,
729 * so that the watchpoint code notices the access.
730 */
731 if (db_watchpoint_list) {
732 /*
733 * If we aren't asking for write permission,
734 * then don't give it away. We're using write
735 * faults to set the dirty bit.
736 */
737 if (!(fault_type & VM_PROT_WRITE))
738 *protection &= ~VM_PROT_WRITE;
739 }
740 #endif /* MACH_KDB */
741
742 interruptible = fault_info->interruptible;
743 interruptible_state = thread_interrupt_level(interruptible);
744
745 /*
746 * INVARIANTS (through entire routine):
747 *
748 * 1) At all times, we must either have the object
749 * lock or a busy page in some object to prevent
750 * some other thread from trying to bring in
751 * the same page.
752 *
753 * Note that we cannot hold any locks during the
754 * pager access or when waiting for memory, so
755 * we use a busy page then.
756 *
757 * 2) To prevent another thread from racing us down the
758 * shadow chain and entering a new page in the top
759 * object before we do, we must keep a busy page in
760 * the top object while following the shadow chain.
761 *
762 * 3) We must increment paging_in_progress on any object
763 * for which we have a busy page before dropping
764 * the object lock
765 *
766 * 4) We leave busy pages on the pageout queues.
767 * If the pageout daemon comes across a busy page,
768 * it will remove the page from the pageout queues.
769 */
770
771 object = first_object;
772 offset = first_offset;
773 first_m = VM_PAGE_NULL;
774 access_required = fault_type;
775
776
777 XPR(XPR_VM_FAULT,
778 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
779 (integer_t)object, offset, fault_type, *protection, 0);
780
781 /*
782 * default type of fault
783 */
784 my_fault = DBG_CACHE_HIT_FAULT;
785
786 while (TRUE) {
787 #if TRACEFAULTPAGE
788 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
789 #endif
790 if (!object->alive) {
791 /*
792 * object is no longer valid
793 * clean up and return error
794 */
795 vm_fault_cleanup(object, first_m);
796 thread_interrupt_level(interruptible_state);
797
798 return (VM_FAULT_MEMORY_ERROR);
799 }
800
801 /*
802 * See whether the page at 'offset' is resident
803 */
804 m = vm_page_lookup(object, offset);
805 #if TRACEFAULTPAGE
806 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
807 #endif
808 if (m != VM_PAGE_NULL) {
809
810 if (m->busy) {
811 /*
812 * The page is being brought in,
813 * wait for it and then retry.
814 *
815 * A possible optimization: if the page
816 * is known to be resident, we can ignore
817 * pages that are absent (regardless of
818 * whether they're busy).
819 */
820 #if TRACEFAULTPAGE
821 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
822 #endif
823 wait_result = PAGE_SLEEP(object, m, interruptible);
824 XPR(XPR_VM_FAULT,
825 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
826 (integer_t)object, offset,
827 (integer_t)m, 0, 0);
828 counter(c_vm_fault_page_block_busy_kernel++);
829
830 if (wait_result != THREAD_AWAKENED) {
831 vm_fault_cleanup(object, first_m);
832 thread_interrupt_level(interruptible_state);
833
834 if (wait_result == THREAD_RESTART)
835 return (VM_FAULT_RETRY);
836 else
837 return (VM_FAULT_INTERRUPTED);
838 }
839 continue;
840 }
841
842 if (m->phys_page == vm_page_guard_addr) {
843 /*
844 * Guard page: off limits !
845 */
846 if (fault_type == VM_PROT_NONE) {
847 /*
848 * The fault is not requesting any
849 * access to the guard page, so it must
850 * be just to wire or unwire it.
851 * Let's pretend it succeeded...
852 */
853 m->busy = TRUE;
854 *result_page = m;
855 assert(first_m == VM_PAGE_NULL);
856 *top_page = first_m;
857 if (type_of_fault)
858 *type_of_fault = DBG_GUARD_FAULT;
859 return VM_FAULT_SUCCESS;
860 } else {
861 /*
862 * The fault requests access to the
863 * guard page: let's deny that !
864 */
865 vm_fault_cleanup(object, first_m);
866 thread_interrupt_level(interruptible_state);
867 return VM_FAULT_MEMORY_ERROR;
868 }
869 }
870
871 if (m->error) {
872 /*
873 * The page is in error, give up now.
874 */
875 #if TRACEFAULTPAGE
876 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
877 #endif
878 if (error_code)
879 *error_code = KERN_MEMORY_ERROR;
880 VM_PAGE_FREE(m);
881
882 vm_fault_cleanup(object, first_m);
883 thread_interrupt_level(interruptible_state);
884
885 return (VM_FAULT_MEMORY_ERROR);
886 }
887 if (m->restart) {
888 /*
889 * The pager wants us to restart
890 * at the top of the chain,
891 * typically because it has moved the
892 * page to another pager, then do so.
893 */
894 #if TRACEFAULTPAGE
895 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
896 #endif
897 VM_PAGE_FREE(m);
898
899 vm_fault_cleanup(object, first_m);
900 thread_interrupt_level(interruptible_state);
901
902 return (VM_FAULT_RETRY);
903 }
904 if (m->absent) {
905 /*
906 * The page isn't busy, but is absent,
907 * therefore it's deemed "unavailable".
908 *
909 * Remove the non-existent page (unless it's
910 * in the top object) and move on down to the
911 * next object (if there is one).
912 */
913 #if TRACEFAULTPAGE
914 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
915 #endif
916 next_object = object->shadow;
917
918 if (next_object == VM_OBJECT_NULL) {
919 /*
920 * Absent page at bottom of shadow
921 * chain; zero fill the page we left
922 * busy in the first object, and free
923 * the absent page.
924 */
925 assert(!must_be_resident);
926
927 /*
928 * check for any conditions that prevent
929 * us from creating a new zero-fill page
930 * vm_fault_check will do all of the
931 * fault cleanup in the case of an error condition
932 * including resetting the thread_interrupt_level
933 */
934 error = vm_fault_check(object, m, first_m, interruptible_state);
935
936 if (error != VM_FAULT_SUCCESS)
937 return (error);
938
939 XPR(XPR_VM_FAULT,
940 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
941 (integer_t)object, offset,
942 (integer_t)m,
943 (integer_t)first_object, 0);
944
945 if (object != first_object) {
946 /*
947 * free the absent page we just found
948 */
949 VM_PAGE_FREE(m);
950
951 /*
952 * drop reference and lock on current object
953 */
954 vm_object_paging_end(object);
955 vm_object_unlock(object);
956
957 /*
958 * grab the original page we
959 * 'soldered' in place and
960 * retake lock on 'first_object'
961 */
962 m = first_m;
963 first_m = VM_PAGE_NULL;
964
965 object = first_object;
966 offset = first_offset;
967
968 vm_object_lock(object);
969 } else {
970 /*
971 * we're going to use the absent page we just found
972 * so convert it to a 'busy' page
973 */
974 m->absent = FALSE;
975 m->busy = TRUE;
976 }
977 /*
978 * zero-fill the page and put it on
979 * the correct paging queue
980 */
981 my_fault = vm_fault_zero_page(m, no_zero_fill);
982
983 break;
984 } else {
985 if (must_be_resident)
986 vm_object_paging_end(object);
987 else if (object != first_object) {
988 vm_object_paging_end(object);
989 VM_PAGE_FREE(m);
990 } else {
991 first_m = m;
992 m->absent = FALSE;
993 m->busy = TRUE;
994
995 vm_page_lockspin_queues();
996 VM_PAGE_QUEUES_REMOVE(m);
997 vm_page_unlock_queues();
998 }
999 XPR(XPR_VM_FAULT,
1000 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1001 (integer_t)object, offset,
1002 (integer_t)next_object,
1003 offset+object->shadow_offset,0);
1004
1005 offset += object->shadow_offset;
1006 fault_info->lo_offset += object->shadow_offset;
1007 fault_info->hi_offset += object->shadow_offset;
1008 access_required = VM_PROT_READ;
1009
1010 vm_object_lock(next_object);
1011 vm_object_unlock(object);
1012 object = next_object;
1013 vm_object_paging_begin(object);
1014
1015 /*
1016 * reset to default type of fault
1017 */
1018 my_fault = DBG_CACHE_HIT_FAULT;
1019
1020 continue;
1021 }
1022 }
1023 if ((m->cleaning)
1024 && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1025 && (fault_type & VM_PROT_WRITE)) {
1026 /*
1027 * This is a copy-on-write fault that will
1028 * cause us to revoke access to this page, but
1029 * this page is in the process of being cleaned
1030 * in a clustered pageout. We must wait until
1031 * the cleaning operation completes before
1032 * revoking access to the original page,
1033 * otherwise we might attempt to remove a
1034 * wired mapping.
1035 */
1036 #if TRACEFAULTPAGE
1037 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
1038 #endif
1039 XPR(XPR_VM_FAULT,
1040 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1041 (integer_t)object, offset,
1042 (integer_t)m, 0, 0);
1043 /*
1044 * take an extra ref so that object won't die
1045 */
1046 vm_object_reference_locked(object);
1047
1048 vm_fault_cleanup(object, first_m);
1049
1050 counter(c_vm_fault_page_block_backoff_kernel++);
1051 vm_object_lock(object);
1052 assert(object->ref_count > 0);
1053
1054 m = vm_page_lookup(object, offset);
1055
1056 if (m != VM_PAGE_NULL && m->cleaning) {
1057 PAGE_ASSERT_WAIT(m, interruptible);
1058
1059 vm_object_unlock(object);
1060 wait_result = thread_block(THREAD_CONTINUE_NULL);
1061 vm_object_deallocate(object);
1062
1063 goto backoff;
1064 } else {
1065 vm_object_unlock(object);
1066
1067 vm_object_deallocate(object);
1068 thread_interrupt_level(interruptible_state);
1069
1070 return (VM_FAULT_RETRY);
1071 }
1072 }
1073 if (type_of_fault == NULL && m->speculative) {
1074 /*
1075 * If we were passed a non-NULL pointer for
1076 * "type_of_fault", than we came from
1077 * vm_fault... we'll let it deal with
1078 * this condition, since it
1079 * needs to see m->speculative to correctly
1080 * account the pageins, otherwise...
1081 * take it off the speculative queue, we'll
1082 * let the caller of vm_fault_page deal
1083 * with getting it onto the correct queue
1084 */
1085 vm_page_lockspin_queues();
1086 VM_PAGE_QUEUES_REMOVE(m);
1087 vm_page_unlock_queues();
1088 }
1089
1090 if (m->encrypted) {
1091 /*
1092 * ENCRYPTED SWAP:
1093 * the user needs access to a page that we
1094 * encrypted before paging it out.
1095 * Decrypt the page now.
1096 * Keep it busy to prevent anyone from
1097 * accessing it during the decryption.
1098 */
1099 m->busy = TRUE;
1100 vm_page_decrypt(m, 0);
1101 assert(object == m->object);
1102 assert(m->busy);
1103 PAGE_WAKEUP_DONE(m);
1104
1105 /*
1106 * Retry from the top, in case
1107 * something changed while we were
1108 * decrypting.
1109 */
1110 continue;
1111 }
1112 ASSERT_PAGE_DECRYPTED(m);
1113
1114 if (m->object->code_signed) {
1115 /*
1116 * CODE SIGNING:
1117 * We just paged in a page from a signed
1118 * memory object but we don't need to
1119                          * validate it now. We'll validate it if and
1120                          * when it gets mapped into a user address
1121 * space for the first time or when the page
1122 * gets copied to another object as a result
1123 * of a copy-on-write.
1124 */
1125 }
1126
1127 /*
1128 * We mark the page busy and leave it on
1129 * the pageout queues. If the pageout
1130                  * daemon comes across it, then it will
1131 * remove the page from the queue, but not the object
1132 */
1133 #if TRACEFAULTPAGE
1134 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1135 #endif
1136 XPR(XPR_VM_FAULT,
1137 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1138 (integer_t)object, offset, (integer_t)m, 0, 0);
1139 assert(!m->busy);
1140 assert(!m->absent);
1141
1142 m->busy = TRUE;
1143 break;
1144 }
1145
1146
1147 /*
1148 * we get here when there is no page present in the object at
1149 * the offset we're interested in... we'll allocate a page
1150 * at this point if the pager associated with
1151 * this object can provide the data or we're the top object...
1152 * object is locked; m == NULL
1153 */
1154 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply);
1155
1156 #if TRACEFAULTPAGE
1157 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1158 #endif
1159 if ((look_for_page || (object == first_object)) && !must_be_resident && !object->phys_contiguous) {
1160 /*
1161 * Allocate a new page for this object/offset pair
1162 */
1163 m = vm_page_grab();
1164 #if TRACEFAULTPAGE
1165 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1166 #endif
1167 if (m == VM_PAGE_NULL) {
1168
1169 vm_fault_cleanup(object, first_m);
1170 thread_interrupt_level(interruptible_state);
1171
1172 return (VM_FAULT_MEMORY_SHORTAGE);
1173 }
1174 vm_page_insert(m, object, offset);
1175 }
1176 if (look_for_page && !must_be_resident) {
1177 kern_return_t rc;
1178
1179 /*
1180 * If the memory manager is not ready, we
1181 * cannot make requests.
1182 */
1183 if (!object->pager_ready) {
1184 #if TRACEFAULTPAGE
1185 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1186 #endif
1187 if (m != VM_PAGE_NULL)
1188 VM_PAGE_FREE(m);
1189
1190 XPR(XPR_VM_FAULT,
1191 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1192 (integer_t)object, offset, 0, 0, 0);
1193
1194 /*
1195 * take an extra ref so object won't die
1196 */
1197 vm_object_reference_locked(object);
1198 vm_fault_cleanup(object, first_m);
1199 counter(c_vm_fault_page_block_backoff_kernel++);
1200
1201 vm_object_lock(object);
1202 assert(object->ref_count > 0);
1203
1204 if (!object->pager_ready) {
1205 wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1206
1207 vm_object_unlock(object);
1208 if (wait_result == THREAD_WAITING)
1209 wait_result = thread_block(THREAD_CONTINUE_NULL);
1210 vm_object_deallocate(object);
1211
1212 goto backoff;
1213 } else {
1214 vm_object_unlock(object);
1215 vm_object_deallocate(object);
1216 thread_interrupt_level(interruptible_state);
1217
1218 return (VM_FAULT_RETRY);
1219 }
1220 }
1221 if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1222 /*
1223 * If there are too many outstanding page
1224 * requests pending on this external object, we
1225 * wait for them to be resolved now.
1226 */
1227 #if TRACEFAULTPAGE
1228 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1229 #endif
1230 if (m != VM_PAGE_NULL)
1231 VM_PAGE_FREE(m);
1232 /*
1233 * take an extra ref so object won't die
1234 */
1235 vm_object_reference_locked(object);
1236
1237 vm_fault_cleanup(object, first_m);
1238
1239 counter(c_vm_fault_page_block_backoff_kernel++);
1240
1241 vm_object_lock(object);
1242 assert(object->ref_count > 0);
1243
1244 if (object->paging_in_progress > vm_object_pagein_throttle) {
1245 vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_IN_PROGRESS, interruptible);
1246
1247 vm_object_unlock(object);
1248 wait_result = thread_block(THREAD_CONTINUE_NULL);
1249 vm_object_deallocate(object);
1250
1251 goto backoff;
1252 } else {
1253 vm_object_unlock(object);
1254 vm_object_deallocate(object);
1255 thread_interrupt_level(interruptible_state);
1256
1257 return (VM_FAULT_RETRY);
1258 }
1259 }
1260 if (m != VM_PAGE_NULL) {
1261 /*
1262 * Indicate that the page is waiting for data
1263 * from the memory manager.
1264 */
1265 m->list_req_pending = TRUE;
1266 m->absent = TRUE;
1267 }
1268
1269 #if TRACEFAULTPAGE
1270 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1271 #endif
1272
1273 /*
1274 * It's possible someone called vm_object_destroy while we weren't
1275 * holding the object lock. If that has happened, then bail out
1276 * here.
1277 */
1278
1279 pager = object->pager;
1280
1281 if (pager == MEMORY_OBJECT_NULL) {
1282 vm_fault_cleanup(object, first_m);
1283 thread_interrupt_level(interruptible_state);
1284 return VM_FAULT_MEMORY_ERROR;
1285 }
1286
1287 /*
1288 * We have an absent page in place for the faulting offset,
1289 * so we can release the object lock.
1290 */
1291
1292 vm_object_unlock(object);
1293
1294 /*
1295 * If this object uses a copy_call strategy,
1296 * and we are interested in a copy of this object
1297 * (having gotten here only by following a
1298 * shadow chain), then tell the memory manager
1299 * via a flag added to the desired_access
1300 * parameter, so that it can detect a race
1301 * between our walking down the shadow chain
1302 * and its pushing pages up into a copy of
1303 * the object that it manages.
1304 */
1305 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1306 wants_copy_flag = VM_PROT_WANTS_COPY;
1307 else
1308 wants_copy_flag = VM_PROT_NONE;
1309
1310 XPR(XPR_VM_FAULT,
1311 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1312 (integer_t)object, offset, (integer_t)m,
1313 access_required | wants_copy_flag, 0);
1314
1315 /*
1316 * Call the memory manager to retrieve the data.
1317 */
1318 rc = memory_object_data_request(
1319 pager,
1320 offset + object->paging_offset,
1321 PAGE_SIZE,
1322 access_required | wants_copy_flag,
1323 (memory_object_fault_info_t)fault_info);
1324
1325 #if TRACEFAULTPAGE
1326 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1327 #endif
1328 vm_object_lock(object);
1329
1330 if (rc != KERN_SUCCESS) {
1331
1332 vm_fault_cleanup(object, first_m);
1333 thread_interrupt_level(interruptible_state);
1334
1335 return ((rc == MACH_SEND_INTERRUPTED) ?
1336 VM_FAULT_INTERRUPTED :
1337 VM_FAULT_MEMORY_ERROR);
1338 }
1339 if ((interruptible != THREAD_UNINT) && (current_thread()->sched_mode & TH_MODE_ABORT)) {
1340
1341 vm_fault_cleanup(object, first_m);
1342 thread_interrupt_level(interruptible_state);
1343
1344 return (VM_FAULT_INTERRUPTED);
1345 }
1346 if (m == VM_PAGE_NULL && object->phys_contiguous) {
1347 /*
1348 * No page here means that the object we
1349 * initially looked up was "physically
1350 * contiguous" (i.e. device memory). However,
1351 * with Virtual VRAM, the object might not
1352 * be backed by that device memory anymore,
1353 * so we're done here only if the object is
1354 * still "phys_contiguous".
1355 * Otherwise, if the object is no longer
1356 * "phys_contiguous", we need to retry the
1357 * page fault against the object's new backing
1358 * store (different memory object).
1359 */
1360 break;
1361 }
1362 /*
1363 * potentially a pagein fault
1364 * if we make it through the state checks
1365                          * above, then we'll count it as such
1366 */
1367 my_fault = DBG_PAGEIN_FAULT;
1368
1369 /*
1370 * Retry with same object/offset, since new data may
1371 * be in a different page (i.e., m is meaningless at
1372 * this point).
1373 */
1374 continue;
1375 }
1376
1377 /*
1378 * We get here if the object has no pager, or an existence map
1379 * exists and indicates the page isn't present on the pager
1380 * or we're unwiring a page. If a pager exists, but there
1381 * is no existence map, then the m->absent case above handles
1382 * the ZF case when the pager can't provide the page
1383 */
1384 #if TRACEFAULTPAGE
1385 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1386 #endif
1387 if (object == first_object)
1388 first_m = m;
1389 else
1390 assert(m == VM_PAGE_NULL);
1391
1392 XPR(XPR_VM_FAULT,
1393 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1394 (integer_t)object, offset, (integer_t)m,
1395 (integer_t)object->shadow, 0);
1396
1397 next_object = object->shadow;
1398
1399 if (next_object == VM_OBJECT_NULL) {
1400 /*
1401                          * we've hit the bottom of the shadow chain,
1402 * fill the page in the top object with zeros.
1403 */
1404 assert(!must_be_resident);
1405
1406 if (object != first_object) {
1407 vm_object_paging_end(object);
1408 vm_object_unlock(object);
1409
1410 object = first_object;
1411 offset = first_offset;
1412 vm_object_lock(object);
1413 }
1414 m = first_m;
1415 assert(m->object == object);
1416 first_m = VM_PAGE_NULL;
1417
1418 /*
1419 * check for any conditions that prevent
1420 * us from creating a new zero-fill page
1421 * vm_fault_check will do all of the
1422 * fault cleanup in the case of an error condition
1423 * including resetting the thread_interrupt_level
1424 */
1425 error = vm_fault_check(object, m, first_m, interruptible_state);
1426
1427 if (error != VM_FAULT_SUCCESS)
1428 return (error);
1429
1430 if (m == VM_PAGE_NULL) {
1431 m = vm_page_grab();
1432
1433 if (m == VM_PAGE_NULL) {
1434 vm_fault_cleanup(object, VM_PAGE_NULL);
1435 thread_interrupt_level(interruptible_state);
1436
1437 return (VM_FAULT_MEMORY_SHORTAGE);
1438 }
1439 vm_page_insert(m, object, offset);
1440 }
1441 my_fault = vm_fault_zero_page(m, no_zero_fill);
1442
1443 break;
1444
1445 } else {
1446 /*
1447 * Move on to the next object. Lock the next
1448 * object before unlocking the current one.
1449 */
1450 if ((object != first_object) || must_be_resident)
1451 vm_object_paging_end(object);
1452
1453 offset += object->shadow_offset;
1454 fault_info->lo_offset += object->shadow_offset;
1455 fault_info->hi_offset += object->shadow_offset;
1456 access_required = VM_PROT_READ;
1457
1458 vm_object_lock(next_object);
1459 vm_object_unlock(object);
1460
1461 object = next_object;
1462 vm_object_paging_begin(object);
1463 }
1464 }
1465
1466 /*
1467 * PAGE HAS BEEN FOUND.
1468 *
1469 * This page (m) is:
1470 * busy, so that we can play with it;
1471 * not absent, so that nobody else will fill it;
1472 * possibly eligible for pageout;
1473 *
1474 * The top-level page (first_m) is:
1475 * VM_PAGE_NULL if the page was found in the
1476 * top-level object;
1477 * busy, not absent, and ineligible for pageout.
1478 *
1479 * The current object (object) is locked. A paging
1480 * reference is held for the current and top-level
1481 * objects.
1482 */
1483
1484 #if TRACEFAULTPAGE
1485 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1486 #endif
1487 #if EXTRA_ASSERTIONS
1488 if (m != VM_PAGE_NULL) {
1489 assert(m->busy && !m->absent);
1490 assert((first_m == VM_PAGE_NULL) ||
1491 (first_m->busy && !first_m->absent &&
1492 !first_m->active && !first_m->inactive));
1493 }
1494 #endif /* EXTRA_ASSERTIONS */
1495
1496 /*
1497 * ENCRYPTED SWAP:
1498 * If we found a page, we must have decrypted it before we
1499 * get here...
1500 */
1501 if (m != VM_PAGE_NULL) {
1502 ASSERT_PAGE_DECRYPTED(m);
1503 }
1504
1505 XPR(XPR_VM_FAULT,
1506 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1507 (integer_t)object, offset, (integer_t)m,
1508 (integer_t)first_object, (integer_t)first_m);
1509
1510 /*
1511 * If the page is being written, but isn't
1512 * already owned by the top-level object,
1513 * we have to copy it into a new page owned
1514 * by the top-level object.
1515 */
1516 if ((object != first_object) && (m != VM_PAGE_NULL)) {
1517
1518 #if TRACEFAULTPAGE
1519 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1520 #endif
1521 if (fault_type & VM_PROT_WRITE) {
1522 vm_page_t copy_m;
1523
1524 /*
1525 * We only really need to copy if we
1526 * want to write it.
1527 */
1528 assert(!must_be_resident);
1529
1530 /*
1531 * are we protecting the system from
1532                          * backing store exhaustion? If so,
1533 * sleep unless we are privileged.
1534 */
1535 if (vm_backing_store_low) {
1536 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1537
1538 RELEASE_PAGE(m);
1539 vm_fault_cleanup(object, first_m);
1540
1541 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1542
1543 thread_block(THREAD_CONTINUE_NULL);
1544 thread_interrupt_level(interruptible_state);
1545
1546 return (VM_FAULT_RETRY);
1547 }
1548 }
1549 /*
1550 * If we try to collapse first_object at this
1551 * point, we may deadlock when we try to get
1552 * the lock on an intermediate object (since we
1553 * have the bottom object locked). We can't
1554 * unlock the bottom object, because the page
1555 * we found may move (by collapse) if we do.
1556 *
1557 * Instead, we first copy the page. Then, when
1558 * we have no more use for the bottom object,
1559 * we unlock it and try to collapse.
1560 *
1561 * Note that we copy the page even if we didn't
1562 * need to... that's the breaks.
1563 */
1564
1565 /*
1566 * Allocate a page for the copy
1567 */
1568 copy_m = vm_page_grab();
1569
1570 if (copy_m == VM_PAGE_NULL) {
1571 RELEASE_PAGE(m);
1572
1573 vm_fault_cleanup(object, first_m);
1574 thread_interrupt_level(interruptible_state);
1575
1576 return (VM_FAULT_MEMORY_SHORTAGE);
1577 }
1578 XPR(XPR_VM_FAULT,
1579 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1580 (integer_t)object, offset,
1581 (integer_t)m, (integer_t)copy_m, 0);
1582
1583 vm_page_copy(m, copy_m);
1584
1585 /*
1586 * If another map is truly sharing this
1587 * page with us, we have to flush all
1588 * uses of the original page, since we
1589 * can't distinguish those which want the
1590 * original from those which need the
1591 * new copy.
1592 *
1593 * XXXO If we know that only one map has
1594 * access to this page, then we could
1595 * avoid the pmap_disconnect() call.
1596 */
1597 if (m->pmapped)
1598 pmap_disconnect(m->phys_page);
1599
1600 assert(!m->cleaning);
1601
1602 /*
1603 * We no longer need the old page or object.
1604 */
1605 PAGE_WAKEUP_DONE(m);
1606 vm_object_paging_end(object);
1607 vm_object_unlock(object);
1608
1609 my_fault = DBG_COW_FAULT;
1610 VM_STAT_INCR(cow_faults);
1611 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1612 current_task()->cow_faults++;
1613
1614 object = first_object;
1615 offset = first_offset;
1616
1617 vm_object_lock(object);
1618 /*
1619 * get rid of the place holder
1620 * page that we soldered in earlier
1621 */
1622 VM_PAGE_FREE(first_m);
1623 first_m = VM_PAGE_NULL;
1624
1625 /*
1626 * and replace it with the
1627 * page we just copied into
1628 */
1629 assert(copy_m->busy);
1630 vm_page_insert(copy_m, object, offset);
1631 copy_m->dirty = TRUE;
1632
1633 m = copy_m;
1634 /*
1635 * Now that we've gotten the copy out of the
1636 * way, let's try to collapse the top object.
1637 * But we have to play ugly games with
1638 * paging_in_progress to do that...
1639 */
1640 vm_object_paging_end(object);
1641 vm_object_collapse(object, offset, TRUE);
1642 vm_object_paging_begin(object);
1643
1644 } else
1645 *protection &= (~VM_PROT_WRITE);
1646 }
1647 /*
1648 * Now check whether the page needs to be pushed into the
1649 * copy object. The use of asymmetric copy on write for
1650 * shared temporary objects means that we may do two copies to
1651 * satisfy the fault; one above to get the page from a
1652 * shadowed object, and one here to push it into the copy.
1653 */
1654 try_failed_count = 0;
1655
1656 while ((copy_object = first_object->copy) != VM_OBJECT_NULL && (m != VM_PAGE_NULL)) {
1657 vm_object_offset_t copy_offset;
1658 vm_page_t copy_m;
1659
1660 #if TRACEFAULTPAGE
1661 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1662 #endif
1663 /*
1664 * If the page is being written, but hasn't been
1665 * copied to the copy-object, we have to copy it there.
1666 */
1667 if ((fault_type & VM_PROT_WRITE) == 0) {
1668 *protection &= ~VM_PROT_WRITE;
1669 break;
1670 }
1671
1672 /*
1673 * If the page was guaranteed to be resident,
1674 * we must have already performed the copy.
1675 */
1676 if (must_be_resident)
1677 break;
1678
1679 /*
1680 * Try to get the lock on the copy_object.
1681 */
1682 if (!vm_object_lock_try(copy_object)) {
1683
1684 vm_object_unlock(object);
1685 try_failed_count++;
1686
1687 mutex_pause(try_failed_count); /* wait a bit */
1688 vm_object_lock(object);
1689
1690 continue;
1691 }
1692 try_failed_count = 0;
1693
1694 /*
1695 * Make another reference to the copy-object,
1696 * to keep it from disappearing during the
1697 * copy.
1698 */
1699 vm_object_reference_locked(copy_object);
1700
1701 /*
1702 * Does the page exist in the copy?
1703 */
1704 copy_offset = first_offset - copy_object->shadow_offset;
1705
1706 if (copy_object->size <= copy_offset)
1707 /*
1708 * Copy object doesn't cover this page -- do nothing.
1709 */
1710 ;
1711 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1712 /*
1713 * Page currently exists in the copy object
1714 */
1715 if (copy_m->busy) {
1716 /*
1717 * If the page is being brought
1718 * in, wait for it and then retry.
1719 */
1720 RELEASE_PAGE(m);
1721
1722 /*
1723 * take an extra ref so object won't die
1724 */
1725 vm_object_reference_locked(copy_object);
1726 vm_object_unlock(copy_object);
1727 vm_fault_cleanup(object, first_m);
1728 counter(c_vm_fault_page_block_backoff_kernel++);
1729
1730 vm_object_lock(copy_object);
1731 assert(copy_object->ref_count > 0);
1732 VM_OBJ_RES_DECR(copy_object);
1733 vm_object_lock_assert_exclusive(copy_object);
1734 copy_object->ref_count--;
1735 assert(copy_object->ref_count > 0);
1736 copy_m = vm_page_lookup(copy_object, copy_offset);
1737 /*
1738 * ENCRYPTED SWAP:
1739 * it's OK if the "copy_m" page is encrypted,
1740 * because we're not moving it nor handling its
1741 * contents.
1742 */
1743 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1744 PAGE_ASSERT_WAIT(copy_m, interruptible);
1745
1746 vm_object_unlock(copy_object);
1747 wait_result = thread_block(THREAD_CONTINUE_NULL);
1748 vm_object_deallocate(copy_object);
1749
1750 goto backoff;
1751 } else {
1752 vm_object_unlock(copy_object);
1753 vm_object_deallocate(copy_object);
1754 thread_interrupt_level(interruptible_state);
1755
1756 return (VM_FAULT_RETRY);
1757 }
1758 }
1759 }
1760 else if (!PAGED_OUT(copy_object, copy_offset)) {
1761 /*
1762 * If PAGED_OUT is TRUE, then the page used to exist
1763 * in the copy-object, and has already been paged out.
1764 * We don't need to repeat this. If PAGED_OUT is
1765 * FALSE, then either we don't know (!pager_created,
1766 * for example) or it hasn't been paged out.
1767 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1768 * We must copy the page to the copy object.
1769 */
1770
1771 if (vm_backing_store_low) {
1772 /*
1773 * we are protecting the system from
1774                          * backing store exhaustion, so
1775 * sleep unless we are privileged.
1776 */
1777 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1778 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1779
1780 RELEASE_PAGE(m);
1781 VM_OBJ_RES_DECR(copy_object);
1782 vm_object_lock_assert_exclusive(copy_object);
1783 copy_object->ref_count--;
1784 assert(copy_object->ref_count > 0);
1785
1786 vm_object_unlock(copy_object);
1787 vm_fault_cleanup(object, first_m);
1788 thread_block(THREAD_CONTINUE_NULL);
1789 thread_interrupt_level(interruptible_state);
1790
1791 return (VM_FAULT_RETRY);
1792 }
1793 }
1794 /*
1795 * Allocate a page for the copy
1796 */
1797 copy_m = vm_page_alloc(copy_object, copy_offset);
1798
1799 if (copy_m == VM_PAGE_NULL) {
1800 RELEASE_PAGE(m);
1801
1802 VM_OBJ_RES_DECR(copy_object);
1803 vm_object_lock_assert_exclusive(copy_object);
1804 copy_object->ref_count--;
1805 assert(copy_object->ref_count > 0);
1806
1807 vm_object_unlock(copy_object);
1808 vm_fault_cleanup(object, first_m);
1809 thread_interrupt_level(interruptible_state);
1810
1811 return (VM_FAULT_MEMORY_SHORTAGE);
1812 }
1813 /*
1814 * Must copy page into copy-object.
1815 */
1816 vm_page_copy(m, copy_m);
1817
1818 /*
1819 * If the old page was in use by any users
1820 * of the copy-object, it must be removed
1821 * from all pmaps. (We can't know which
1822 * pmaps use it.)
1823 */
1824 if (m->pmapped)
1825 pmap_disconnect(m->phys_page);
1826
1827 /*
1828 * If there's a pager, then immediately
1829 * page out this page, using the "initialize"
1830 * option. Else, we use the copy.
1831 */
1832 if ((!copy_object->pager_created)
1833 #if MACH_PAGEMAP
1834 || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
1835 #endif
1836 ) {
1837
1838 vm_page_lockspin_queues();
1839 assert(!m->cleaning);
1840 vm_page_activate(copy_m);
1841 vm_page_unlock_queues();
1842
1843 copy_m->dirty = TRUE;
1844 PAGE_WAKEUP_DONE(copy_m);
1845 }
1846 else {
1847 assert(copy_m->busy == TRUE);
1848 assert(!m->cleaning);
1849
1850 /*
1851 * dirty is protected by the object lock
1852 */
1853 copy_m->dirty = TRUE;
1854
1855 /*
1856 * The page is already ready for pageout:
1857 * not on pageout queues and busy.
1858 * Unlock everything except the
1859 * copy_object itself.
1860 */
1861 vm_object_unlock(object);
1862
1863 /*
1864 * Write the page to the copy-object,
1865 * flushing it from the kernel.
1866 */
1867 vm_pageout_initialize_page(copy_m);
1868
1869 /*
1870 * Since the pageout may have
1871 * temporarily dropped the
1872 * copy_object's lock, we
1873 * check whether we'll have
1874 * to deallocate the hard way.
1875 */
1876 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
1877 vm_object_unlock(copy_object);
1878 vm_object_deallocate(copy_object);
1879 vm_object_lock(object);
1880
1881 continue;
1882 }
1883 /*
1884 * Pick back up the old object's
1885 * lock. [It is safe to do so,
1886 * since it must be deeper in the
1887 * object tree.]
1888 */
1889 vm_object_lock(object);
1890 }
1891 /*
1892 * Because we're pushing a page upward
1893 * in the object tree, we must restart
1894 * any faults that are waiting here.
1895 * [Note that this is an expansion of
1896 * PAGE_WAKEUP that uses the THREAD_RESTART
1897 * wait result]. Can't turn off the page's
1898 * busy bit because we're not done with it.
1899 */
1900 if (m->wanted) {
1901 m->wanted = FALSE;
1902 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
1903 }
1904 }
1905 /*
1906 * The reference count on copy_object must be
1907 * at least 2: one for our extra reference,
1908 * and at least one from the outside world
1909 * (we checked that when we last locked
1910 * copy_object).
1911 */
1912 vm_object_lock_assert_exclusive(copy_object);
1913 copy_object->ref_count--;
1914 assert(copy_object->ref_count > 0);
1915
1916 VM_OBJ_RES_DECR(copy_object);
1917 vm_object_unlock(copy_object);
1918
1919 break;
1920 }
1921 *result_page = m;
1922 *top_page = first_m;
1923
1924 XPR(XPR_VM_FAULT,
1925 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1926 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0);
1927
1928 if (m != VM_PAGE_NULL) {
1929 if (my_fault == DBG_PAGEIN_FAULT) {
1930
1931 VM_STAT_INCR(pageins);
1932 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
1933 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1934 current_task()->pageins++;
1935
1936 if (m->object->internal) {
1937 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
1938 } else {
1939 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
1940 }
1941
1942 /*
1943 * evaluate access pattern and update state;
1944 * vm_fault_deactivate_behind depends on the
1945 * state being up to date
1946 */
1947 vm_fault_is_sequential(object, offset, fault_info->behavior);
1948
1949 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
1950 }
1951 if (type_of_fault)
1952 *type_of_fault = my_fault;
1953 } else
1954 vm_object_unlock(object);
1955
1956 thread_interrupt_level(interruptible_state);
1957
1958 #if TRACEFAULTPAGE
1959 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
1960 #endif
1961 return (VM_FAULT_SUCCESS);
1962
1963 backoff:
1964 thread_interrupt_level(interruptible_state);
1965
1966 if (wait_result == THREAD_INTERRUPTED)
1967 return (VM_FAULT_INTERRUPTED);
1968 return (VM_FAULT_RETRY);
1969
1970 #undef RELEASE_PAGE
1971 }
1972
1973
1974
1975 /*
1976 * CODE SIGNING:
1977 * When soft faulting a page, we have to validate the page if:
1978 * 1. the page is being mapped in user space
1979 * 2. the page hasn't already been found to be "tainted"
1980 * 3. the page belongs to a code-signed object
1981 * 4. the page has not been validated yet or has been mapped for write.
1982 */
1983 #define VM_FAULT_NEED_CS_VALIDATION(pmap, page) \
1984 ((pmap) != kernel_pmap /*1*/ && \
1985 !(page)->cs_tainted /*2*/ && \
1986 (page)->object->code_signed /*3*/ && \
1987 (!(page)->cs_validated || (page)->wpmapped /*4*/))
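/*
 * Illustrative use of the macro above (a sketch of the pattern followed
 * by vm_fault_enter() and the soft-fault path below, not a new call site):
 *
 *	if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
 *		vm_object_lock_assert_exclusive(m->object);
 *		vm_page_validate_cs(m);
 *	}
 */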
1988
1989
1990 /*
1991 * page queue lock must NOT be held
1992 * m->object must be locked
1993 *
1994 * NOTE: m->object could be locked "shared" only if we are called
1995 * from vm_fault() as part of a soft fault. If so, we must be
1996 * careful not to modify the VM object in any way that is not
1997 * legal under a shared lock...
1998 */
1999 unsigned long cs_enter_tainted_rejected = 0;
2000 unsigned long cs_enter_tainted_accepted = 0;
2001 kern_return_t
2002 vm_fault_enter(vm_page_t m,
2003 pmap_t pmap,
2004 vm_map_offset_t vaddr,
2005 vm_prot_t prot,
2006 boolean_t wired,
2007 boolean_t change_wiring,
2008 boolean_t no_cache,
2009 int *type_of_fault)
2010 {
2011 unsigned int cache_attr;
2012 kern_return_t kr;
2013 boolean_t previously_pmapped = m->pmapped;
2014
2015 vm_object_lock_assert_held(m->object);
2016 #if DEBUG
2017 mutex_assert(&vm_page_queue_lock, MA_NOTOWNED);
2018 #endif /* DEBUG */
2019
2020 if (m->phys_page == vm_page_guard_addr) {
2021 assert(m->fictitious);
2022 return KERN_SUCCESS;
2023 }
2024
2025 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
2026
2027 if (m->pmapped == FALSE) {
2028 /*
2029 * This is the first time this page is being
2030 * mapped in an address space (pmapped == FALSE).
2031 *
2032 * Part of that page may still be in the data cache
2033 * and not flushed to memory. In case we end up
2034 * accessing that page via the instruction cache,
2035 * we need to ensure that the 2 caches are in sync.
2036 */
2037 pmap_sync_page_data_phys(m->phys_page);
2038
2039 if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2040 /*
2041 * found it in the cache, but this
2042 * is the first fault-in of the page (m->pmapped == FALSE)
2043 * so it must have come in as part of
2044 * a cluster... account 1 pagein against it
2045 */
2046 VM_STAT_INCR(pageins);
2047 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2048
2049 if (m->object->internal) {
2050 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2051 } else {
2052 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2053 }
2054
2055 current_task()->pageins++;
2056
2057 *type_of_fault = DBG_PAGEIN_FAULT;
2058 }
2059 VM_PAGE_CONSUME_CLUSTERED(m);
2060
2061 } else if (cache_attr != VM_WIMG_DEFAULT)
2062 pmap_sync_page_attributes_phys(m->phys_page);
2063
2064 if (*type_of_fault != DBG_COW_FAULT) {
2065 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2066
2067 if (pmap == kernel_pmap) {
2068 DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2069 }
2070 }
2071
2072 if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
2073 vm_object_lock_assert_exclusive(m->object);
2074
2075 if (m->cs_validated) {
2076 vm_cs_revalidates++;
2077 }
2078
2079 /* VM map is locked, so 1 ref will remain on VM object */
2080 vm_page_validate_cs(m);
2081 }
2082
2083 if (m->cs_tainted /* always invalidate a tainted page */
2084 #if CONFIG_ENFORCE_SIGNED_CODE
2085 /*
2086 * Code Signing enforcement invalidates an executable page that
2087 * has no code directory, and thus could not be validated.
2088 */
2089 || ((prot & VM_PROT_EXECUTE) && !m->cs_validated )
2090 #endif
2091 ) {
2092 /*
2093 * CODE SIGNING:
2094 * This page has been tainted and cannot be trusted.
2095 * Let's notify the current process and let it take any
2096 * necessary precautions before we enter the tainted page
2097 * into its address space.
2098 */
2099 kr = KERN_SUCCESS;
2100 #if CONFIG_ENFORCE_SIGNED_CODE
2101 if (!cs_enforcement_disable) {
2102 #endif
2103 if (cs_invalid_page((addr64_t) vaddr)) {
2104 /* reject the tainted page: abort the page fault */
2105 kr = KERN_MEMORY_ERROR;
2106 cs_enter_tainted_rejected++;
2107 } else {
2108 /* proceed with the tainted page */
2109 kr = KERN_SUCCESS;
2110 cs_enter_tainted_accepted++;
2111 }
2112 #if CONFIG_ENFORCE_SIGNED_CODE
2113 }
2114 #endif
2115 if (cs_debug || kr != KERN_SUCCESS) {
2116 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2117 "page %p obj %p off 0x%llx *** INVALID PAGE ***\n",
2118 (long long)vaddr, m, m->object, m->offset);
2119 }
2120 } else {
2121 /* proceed with the valid page */
2122 kr = KERN_SUCCESS;
2123 }
2124
2125 if (kr == KERN_SUCCESS) {
2126 /*
2127 * NOTE: we may only hold the vm_object lock SHARED
2128 * at this point, but the update of pmapped is ok
2129 * since this is the ONLY bit updated behind the SHARED
2130 * lock... however, we need to figure out how to do an atomic
2131 * update on a bit field to make this less fragile... right
2132 * now I don't know how to coerce 'C' to give me the offset info
2133 * that's needed for an AtomicCompareAndSwap
2134 */
2135 m->pmapped = TRUE;
2136 if (prot & VM_PROT_WRITE) {
2137 vm_object_lock_assert_exclusive(m->object);
2138 m->wpmapped = TRUE;
2139 }
2140
2141 PMAP_ENTER(pmap, vaddr, m, prot, cache_attr, wired);
2142 }
2143
2144 /*
2145 * Hold queues lock to manipulate
2146 * the page queues. Change wiring
2147 * case is obvious.
2148 */
2149 if (change_wiring) {
2150 vm_page_lockspin_queues();
2151
2152 if (wired) {
2153 if (kr == KERN_SUCCESS) {
2154 vm_page_wire(m);
2155 }
2156 } else {
2157 vm_page_unwire(m);
2158 }
2159 vm_page_unlock_queues();
2160
2161 } else {
2162 if (kr != KERN_SUCCESS) {
2163 vm_page_lock_queues();
2164 vm_page_deactivate(m);
2165 vm_page_unlock_queues();
2166 } else {
2167 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count && !m->throttled) {
2168 vm_page_lockspin_queues();
2169 /*
2170 * test again now that we hold the page queue lock
2171 */
2172 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count) {
2173
2174 /*
2175 * If this is a no_cache mapping and the page has never been
2176 * mapped before or was previously a no_cache page, then we
2177 * want to leave pages in the speculative state so that they
2178 * can be readily recycled if free memory runs low. Otherwise
2179 * the page is activated as normal.
2180 */
2181
2182 if (no_cache && (!previously_pmapped || m->no_cache)) {
2183 m->no_cache = TRUE;
2184
2185 if (m->active || m->inactive)
2186 VM_PAGE_QUEUES_REMOVE(m);
2187
2188 if (!m->speculative)
2189 vm_page_speculate(m, TRUE);
2190
2191 } else if (!m->active && !m->inactive)
2192 vm_page_activate(m);
2193
2194 }
2195
2196 vm_page_unlock_queues();
2197 }
2198 }
2199 }
2200 return kr;
2201 }
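/*
 * Usage sketch for vm_fault_enter() (derived from the preconditions
 * documented above and the call sites in vm_fault() below; shown only
 * as a reminder, not a new call site):
 *
 *	vm_object_lock(m->object);	// a shared lock is enough for soft faults
 *	kr = vm_fault_enter(m, pmap, vaddr, prot,
 *			    wired, change_wiring, fault_info.no_cache,
 *			    &type_of_fault);
 *	vm_object_unlock(m->object);
 */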
2202
2203
2204 /*
2205 * Routine: vm_fault
2206 * Purpose:
2207 * Handle page faults, including pseudo-faults
2208 * used to change the wiring status of pages.
2209 * Returns:
2210 * Explicit continuations have been removed.
2211 * Implementation:
2212 * vm_fault and vm_fault_page save mucho state
2213 * in the moral equivalent of a closure. The state
2214 * structure is allocated when first entering vm_fault
2215 * and deallocated when leaving vm_fault.
2216 */
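/*
 * Typical invocation (a hedged sketch; the real trap-handler call sites
 * live outside this file and may differ):
 *
 *	kr = vm_fault(map, vaddr, VM_PROT_READ,
 *		      FALSE,			// change_wiring
 *		      THREAD_ABORTSAFE,		// interruptible
 *		      NULL, 0);			// caller_pmap, caller_pmap_addr
 */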
2217
2218 extern int _map_enter_debug;
2219
2220 unsigned long vm_fault_collapse_total = 0;
2221 unsigned long vm_fault_collapse_skipped = 0;
2222
2223 kern_return_t
2224 vm_fault(
2225 vm_map_t map,
2226 vm_map_offset_t vaddr,
2227 vm_prot_t fault_type,
2228 boolean_t change_wiring,
2229 int interruptible,
2230 pmap_t caller_pmap,
2231 vm_map_offset_t caller_pmap_addr)
2232 {
2233 vm_map_version_t version; /* Map version for verification */
2234 boolean_t wired; /* Should mapping be wired down? */
2235 vm_object_t object; /* Top-level object */
2236 vm_object_offset_t offset; /* Top-level offset */
2237 vm_prot_t prot; /* Protection for mapping */
2238 vm_object_t old_copy_object; /* Saved copy object */
2239 vm_page_t result_page; /* Result of vm_fault_page */
2240 vm_page_t top_page; /* Placeholder page */
2241 kern_return_t kr;
2242
2243 vm_page_t m; /* Fast access to result_page */
2244 kern_return_t error_code;
2245 vm_object_t cur_object;
2246 vm_object_offset_t cur_offset;
2247 vm_page_t cur_m;
2248 vm_object_t new_object;
2249 int type_of_fault;
2250 pmap_t pmap;
2251 boolean_t interruptible_state;
2252 vm_map_t real_map = map;
2253 vm_map_t original_map = map;
2254 vm_prot_t original_fault_type;
2255 struct vm_object_fault_info fault_info;
2256 boolean_t need_collapse = FALSE;
2257 int object_lock_type = 0;
2258 int cur_object_lock_type;
2259 vm_object_t top_object = VM_OBJECT_NULL;
2260
2261
2262 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
2263 (int)((uint64_t)vaddr >> 32),
2264 (int)vaddr,
2265 0,
2266 0,
2267 0);
2268
2269 if (get_preemption_level() != 0) {
2270 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
2271 (int)((uint64_t)vaddr >> 32),
2272 (int)vaddr,
2273 KERN_FAILURE,
2274 0,
2275 0);
2276
2277 return (KERN_FAILURE);
2278 }
2279 interruptible_state = thread_interrupt_level(interruptible);
2280
2281 VM_STAT_INCR(faults);
2282 current_task()->faults++;
2283 original_fault_type = fault_type;
2284
2285 if (fault_type & VM_PROT_WRITE)
2286 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2287 else
2288 object_lock_type = OBJECT_LOCK_SHARED;
2289
2290 cur_object_lock_type = OBJECT_LOCK_SHARED;
2291
2292 RetryFault:
2293 /*
2294 * assume we will hit a page in the cache;
2295 * otherwise, explicitly override with
2296 * the real fault type once we determine it
2297 */
2298 type_of_fault = DBG_CACHE_HIT_FAULT;
2299
2300 /*
2301 * Find the backing store object and offset into
2302 * it to begin the search.
2303 */
2304 fault_type = original_fault_type;
2305 map = original_map;
2306 vm_map_lock_read(map);
2307
2308 kr = vm_map_lookup_locked(&map, vaddr, fault_type,
2309 object_lock_type, &version,
2310 &object, &offset, &prot, &wired,
2311 &fault_info,
2312 &real_map);
2313
2314 if (kr != KERN_SUCCESS) {
2315 vm_map_unlock_read(map);
2316 goto done;
2317 }
2318 pmap = real_map->pmap;
2319 fault_info.interruptible = interruptible;
2320
2321 /*
2322 * If the page is wired, we must fault for the current protection
2323 * value, to avoid further faults.
2324 */
2325 if (wired) {
2326 fault_type = prot | VM_PROT_WRITE;
2327 /*
2328 * since we're treating this fault as a 'write'
2329 * we must hold the top object lock exclusively
2330 */
2331 if (object_lock_type == OBJECT_LOCK_SHARED) {
2332
2333 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2334
2335 if (vm_object_lock_upgrade(object) == FALSE) {
2336 /*
2337 * couldn't upgrade, so explicitly
2338 * take the lock exclusively
2339 */
2340 vm_object_lock(object);
2341 }
2342 }
2343 }
2344
2345 #if VM_FAULT_CLASSIFY
2346 /*
2347 * Temporary data gathering code
2348 */
2349 vm_fault_classify(object, offset, fault_type);
2350 #endif
2351 /*
2352 * Fast fault code. The basic idea is to do as much as
2353 * possible while holding the map lock and object locks.
2354 * Busy pages are not used until the object lock has to
2355 * be dropped to do something (copy, zero fill, pmap enter).
2356 * Similarly, paging references aren't acquired until that
2357 * point, and object references aren't used.
2358 *
2359 * If we can figure out what to do
2360 * (zero fill, copy on write, pmap enter) while holding
2361 * the locks, then it gets done. Otherwise, we give up,
2362 * and use the original fault path (which doesn't hold
2363 * the map lock, and relies on busy pages).
2364 * The give up cases include:
2365 * - Have to talk to pager.
2366 * - Page is busy, absent or in error.
2367 * - Pager has locked out desired access.
2368 * - Fault needs to be restarted.
2369 * - Have to push page into copy object.
2370 *
2371 * The code is an infinite loop that moves one level down
2372 * the shadow chain each time. cur_object and cur_offset
2373 * refer to the current object being examined. object and offset
2374 * are the original object from the map. The loop is at the
2375 * top level if and only if object and cur_object are the same.
2376 *
2377 * Invariants: Map lock is held throughout. Lock is held on
2378 * original object and cur_object (if different) when
2379 * continuing or exiting loop.
2380 *
2381 */
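/*
 * Schematic of the fast path below (a comment-only sketch of the loop
 * that follows; no additional logic):
 *
 *	cur_object = object; cur_offset = offset;
 *	while (TRUE) {
 *		m = vm_page_lookup(cur_object, cur_offset);
 *		if (m != VM_PAGE_NULL) {
 *			// busy, guard, unusual or encrypted pages:
 *			//   wait/retry, or "break" to the slow path
 *			// read fault, or no copy object:
 *			//   goto FastPmapEnter
 *			// write fault with cur_object != object:
 *			//   copy the page up into "object", then FastPmapEnter
 *		} else if (pager has no data and there is no shadow) {
 *			// zero-fill a fresh page in "object", then FastPmapEnter
 *		} else {
 *			// descend one level down the shadow chain
 *		}
 *	}
 */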
2382
2383
2384 /*
2385 * If this page is to be inserted in a copy delay object
2386 * for writing, and if the object has a copy, then the
2387 * copy delay strategy is implemented in the slow fault path (vm_fault_page).
2388 */
2389 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
2390 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
2391 goto handle_copy_delay;
2392
2393 cur_object = object;
2394 cur_offset = offset;
2395
2396 while (TRUE) {
2397 m = vm_page_lookup(cur_object, cur_offset);
2398
2399 if (m != VM_PAGE_NULL) {
2400 if (m->busy) {
2401 wait_result_t result;
2402
2403 /*
2404 * in order to do the PAGE_ASSERT_WAIT, we must
2405 * have the object that 'm' belongs to locked exclusively
2406 */
2407 if (object != cur_object) {
2408 vm_object_unlock(object);
2409
2410 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2411
2412 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2413
2414 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2415 /*
2416 * couldn't upgrade so go do a full retry
2417 * immediately since we've already dropped
2418 * the top object lock associated with this page
2419 * and the current one got dropped due to the
2420 * failed upgrade... the state is no longer valid
2421 */
2422 vm_map_unlock_read(map);
2423 if (real_map != map)
2424 vm_map_unlock(real_map);
2425
2426 goto RetryFault;
2427 }
2428 }
2429 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2430
2431 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2432
2433 if (vm_object_lock_upgrade(object) == FALSE) {
2434 /*
2435 * couldn't upgrade, so explicitly take the lock
2436 * exclusively and go relookup the page since we
2437 * will have dropped the object lock and
2438 * a different thread could have inserted
2439 * a page at this offset
2440 * no need for a full retry since we're
2441 * at the top level of the object chain
2442 */
2443 vm_object_lock(object);
2444
2445 continue;
2446 }
2447 }
2448 vm_map_unlock_read(map);
2449 if (real_map != map)
2450 vm_map_unlock(real_map);
2451
2452 result = PAGE_ASSERT_WAIT(m, interruptible);
2453
2454 vm_object_unlock(cur_object);
2455
2456 if (result == THREAD_WAITING) {
2457 result = thread_block(THREAD_CONTINUE_NULL);
2458
2459 counter(c_vm_fault_page_block_busy_kernel++);
2460 }
2461 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2462 goto RetryFault;
2463
2464 kr = KERN_ABORTED;
2465 goto done;
2466 }
2467 if (m->phys_page == vm_page_guard_addr) {
2468 /*
2469 * Guard page: let the slow path deal with it
2470 */
2471 break;
2472 }
2473 if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
2474 /*
2475 * Unusual case... let the slow path deal with it
2476 */
2477 break;
2478 }
2479 if (m->encrypted) {
2480 /*
2481 * ENCRYPTED SWAP:
2482 * We've soft-faulted (because it's not in the page
2483 * table) on an encrypted page.
2484 * Keep the page "busy" so that no one messes with
2485 * it during the decryption.
2486 * Release the extra locks we're holding, keep only
2487 * the page's VM object lock.
2488 *
2489 * in order to set 'busy' on 'm', we must
2490 * have the object that 'm' belongs to locked exclusively
2491 */
2492 if (object != cur_object) {
2493 vm_object_unlock(object);
2494
2495 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2496
2497 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2498
2499 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2500 /*
2501 * couldn't upgrade so go do a full retry
2502 * immediately since we've already dropped
2503 * the top object lock associated with this page
2504 * and the current one got dropped due to the
2505 * failed upgrade... the state is no longer valid
2506 */
2507 vm_map_unlock_read(map);
2508 if (real_map != map)
2509 vm_map_unlock(real_map);
2510
2511 goto RetryFault;
2512 }
2513 }
2514 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2515
2516 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2517
2518 if (vm_object_lock_upgrade(object) == FALSE) {
2519 /*
2520 * couldn't upgrade, so explicitly take the lock
2521 * exclusively and go relookup the page since we
2522 * will have dropped the object lock and
2523 * a different thread could have inserted
2524 * a page at this offset
2525 * no need for a full retry since we're
2526 * at the top level of the object chain
2527 */
2528 vm_object_lock(object);
2529
2530 continue;
2531 }
2532 }
2533 m->busy = TRUE;
2534
2535 vm_map_unlock_read(map);
2536 if (real_map != map)
2537 vm_map_unlock(real_map);
2538
2539 vm_page_decrypt(m, 0);
2540
2541 assert(m->busy);
2542 PAGE_WAKEUP_DONE(m);
2543
2544 vm_object_unlock(cur_object);
2545 /*
2546 * Retry from the top, in case anything
2547 * changed while we were decrypting...
2548 */
2549 goto RetryFault;
2550 }
2551 ASSERT_PAGE_DECRYPTED(m);
2552
2553 if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) {
2554 /*
2555 * We might need to validate this page
2556 * against its code signature, so we
2557 * want to hold the VM object exclusively.
2558 */
2559 if (object != cur_object) {
2560 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2561 vm_object_unlock(object);
2562 vm_object_unlock(cur_object);
2563
2564 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2565
2566 vm_map_unlock_read(map);
2567 if (real_map != map)
2568 vm_map_unlock(real_map);
2569
2570 goto RetryFault;
2571 }
2572
2573 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2574
2575 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2576
2577 if (vm_object_lock_upgrade(object) == FALSE) {
2578 /*
2579 * couldn't upgrade, so explicitly take the lock
2580 * exclusively and go relookup the page since we
2581 * will have dropped the object lock and
2582 * a different thread could have inserted
2583 * a page at this offset
2584 * no need for a full retry since we're
2585 * at the top level of the object chain
2586 */
2587 vm_object_lock(object);
2588
2589 continue;
2590 }
2591 }
2592 }
2593 /*
2594 * Two cases of map-in faults:
2595 * - At top level w/o copy object.
2596 * - Read fault anywhere.
2597 * --> must disallow write.
2598 */
2599
2600 if (object == cur_object && object->copy == VM_OBJECT_NULL) {
2601 if ((fault_type & VM_PROT_WRITE) == 0) {
2602 /*
2603 * This is not a "write" fault, so we
2604 * might not have taken the object lock
2605 * exclusively and we might not be able
2606 * to update the "wpmapped" bit in
2607 * vm_fault_enter().
2608 * Let's just grant read access to
2609 * the page for now and we'll
2610 * soft-fault again if we need write
2611 * access later...
2612 */
2613 prot &= ~VM_PROT_WRITE;
2614 }
2615 goto FastPmapEnter;
2616 }
2617
2618 if ((fault_type & VM_PROT_WRITE) == 0) {
2619
2620 prot &= ~VM_PROT_WRITE;
2621
2622 if (object != cur_object) {
2623 /*
2624 * We still need to hold the top object
2625 * lock here to prevent a race between
2626 * a read fault (taking only "shared"
2627 * locks) and a write fault (taking
2628 * an "exclusive" lock on the top
2629 * object).
2630 * Otherwise, as soon as we release the
2631 * top lock, the write fault could
2632 * proceed and actually complete before
2633 * the read fault, and the copied page's
2634 * translation could then be overwritten
2635 * by the read fault's translation for
2636 * the original page.
2637 *
2638 * Let's just record what the top object
2639 * is and we'll release it later.
2640 */
2641 top_object = object;
2642
2643 /*
2644 * switch to the object that has the new page
2645 */
2646 object = cur_object;
2647 object_lock_type = cur_object_lock_type;
2648 }
2649 FastPmapEnter:
2650 /*
2651 * prepare for the pmap_enter...
2652 * object and map are both locked
2653 * m contains valid data
2654 * object == m->object
2655 * cur_object == NULL or it's been unlocked
2656 * no paging references on either object or cur_object
2657 */
2658 #if MACH_KDB
2659 if (db_watchpoint_list && (fault_type & VM_PROT_WRITE) == 0)
2660 prot &= ~VM_PROT_WRITE;
2661 #endif
2662 if (caller_pmap) {
2663 kr = vm_fault_enter(m,
2664 caller_pmap,
2665 caller_pmap_addr,
2666 prot,
2667 wired,
2668 change_wiring,
2669 fault_info.no_cache,
2670 &type_of_fault);
2671 } else {
2672 kr = vm_fault_enter(m,
2673 pmap,
2674 vaddr,
2675 prot,
2676 wired,
2677 change_wiring,
2678 fault_info.no_cache,
2679 &type_of_fault);
2680 }
2681
2682 if (top_object != VM_OBJECT_NULL) {
2683 /*
2684 * It's safe to drop the top object
2685 * now that we've done our
2686 * vm_fault_enter(). Any other fault
2687 * in progress for that virtual
2688 * address will either find our page
2689 * and translation or put in a new page
2690 * and translation.
2691 */
2692 vm_object_unlock(top_object);
2693 top_object = VM_OBJECT_NULL;
2694 }
2695
2696 if (need_collapse == TRUE)
2697 vm_object_collapse(object, offset, TRUE);
2698
2699 if (type_of_fault == DBG_PAGEIN_FAULT) {
2700 /*
2701 * evaluate access pattern and update state;
2702 * vm_fault_deactivate_behind depends on the
2703 * state being up to date
2704 */
2705 vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
2706
2707 vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
2708 }
2709 /*
2710 * That's it, clean up and return.
2711 */
2712 if (m->busy)
2713 PAGE_WAKEUP_DONE(m);
2714
2715 vm_object_unlock(object);
2716
2717 vm_map_unlock_read(map);
2718 if (real_map != map)
2719 vm_map_unlock(real_map);
2720
2721 goto done;
2722 }
2723 /*
2724 * COPY ON WRITE FAULT
2725 *
2726 * If objects match, then
2727 * object->copy must not be NULL (else control
2728 * would be in previous code block), and we
2729 * have a potential push into the copy object
2730 * which we can't cope with here.
2731 */
2732 if (cur_object == object) {
2733 /*
2734 * must take the slow path to
2735 * deal with the copy push
2736 */
2737 break;
2738 }
2739 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
2740
2741 /*
2742 * This is now a shadow based copy on write
2743 * fault -- it requires a copy up the shadow
2744 * chain.
2745 *
2746 * Allocate a page in the original top level
2747 * object. Give up if allocate fails. Also
2748 * need to remember current page, as it's the
2749 * source of the copy.
2750 *
2751 * at this point we hold locks on both
2752 * object and cur_object... no need to take
2753 * paging refs or mark pages BUSY since
2754 * we don't drop either object lock until
2755 * the page has been copied and inserted
2756 */
2757 cur_m = m;
2758 m = vm_page_grab();
2759
2760 if (m == VM_PAGE_NULL) {
2761 /*
2762 * no free page currently available...
2763 * must take the slow path
2764 */
2765 break;
2766 }
2767 /*
2768 * Now do the copy. Mark the source page busy...
2769 *
2770 * NOTE: This code holds the map lock across
2771 * the page copy.
2772 */
2773 vm_page_copy(cur_m, m);
2774 vm_page_insert(m, object, offset);
2775 m->dirty = TRUE;
2776
2777 /*
2778 * Now cope with the source page and object
2779 */
2780 if (object->ref_count > 1 && cur_m->pmapped)
2781 pmap_disconnect(cur_m->phys_page);
2782
2783 need_collapse = TRUE;
2784
2785 if (!cur_object->internal &&
2786 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
2787 /*
2788 * The object from which we've just
2789 * copied a page is most probably backed
2790 * by a vnode. We don't want to waste too
2791 * much time trying to collapse the VM objects
2792 * and create a bottleneck when several tasks
2793 * map the same file.
2794 */
2795 if (cur_object->copy == object) {
2796 /*
2797 * Shared mapping or no COW yet.
2798 * We can never collapse a copy
2799 * object into its backing object.
2800 */
2801 need_collapse = FALSE;
2802 } else if (cur_object->copy == object->shadow &&
2803 object->shadow->resident_page_count == 0) {
2804 /*
2805 * Shared mapping after a COW occurred.
2806 */
2807 need_collapse = FALSE;
2808 }
2809 }
2810 vm_object_unlock(cur_object);
2811
2812 if (need_collapse == FALSE)
2813 vm_fault_collapse_skipped++;
2814 vm_fault_collapse_total++;
2815
2816 type_of_fault = DBG_COW_FAULT;
2817 VM_STAT_INCR(cow_faults);
2818 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2819 current_task()->cow_faults++;
2820
2821 goto FastPmapEnter;
2822
2823 } else {
2824 /*
2825 * No page at cur_object, cur_offset... m == NULL
2826 */
2827 if (cur_object->pager_created) {
2828 if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) {
2829 /*
2830 * May have to talk to a pager...
2831 * take the slow path.
2832 */
2833 break;
2834 }
2835 /*
2836 * existence map present and indicates
2837 * that the pager doesn't have this page
2838 */
2839 }
2840 if (cur_object->shadow == VM_OBJECT_NULL) {
2841 /*
2842 * Zero fill fault. Page gets
2843 * inserted into the original object.
2844 */
2845 if (cur_object->shadow_severed) {
2846
2847 if (object != cur_object)
2848 vm_object_unlock(cur_object);
2849 vm_object_unlock(object);
2850
2851 vm_map_unlock_read(map);
2852 if (real_map != map)
2853 vm_map_unlock(real_map);
2854
2855 kr = KERN_MEMORY_ERROR;
2856 goto done;
2857 }
2858 if (VM_PAGE_ZFILL_THROTTLED()) {
2859 /*
2860 * drop all of our locks...
2861 * wait until the free queue is
2862 * pumped back up and then
2863 * redrive the fault
2864 */
2865 if (object != cur_object)
2866 vm_object_unlock(cur_object);
2867 vm_object_unlock(object);
2868 vm_map_unlock_read(map);
2869 if (real_map != map)
2870 vm_map_unlock(real_map);
2871
2872 if (vm_page_wait((change_wiring) ?
2873 THREAD_UNINT :
2874 THREAD_ABORTSAFE))
2875 goto RetryFault;
2876
2877 kr = KERN_ABORTED;
2878 goto done;
2879 }
2880 if (vm_backing_store_low) {
2881 /*
2882 * we are protecting the system from
2883 * backing store exhaustion...
2884 * must take the slow path if we're
2885 * not privileged
2886 */
2887 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
2888 break;
2889 }
2890 if (cur_object != object) {
2891 vm_object_unlock(cur_object);
2892
2893 cur_object = object;
2894 }
2895 if (object_lock_type == OBJECT_LOCK_SHARED) {
2896
2897 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2898
2899 if (vm_object_lock_upgrade(object) == FALSE) {
2900 /*
2901 * couldn't upgrade so do a full retry on the fault
2902 * since we dropped the object lock which
2903 * could allow another thread to insert
2904 * a page at this offset
2905 */
2906 vm_map_unlock_read(map);
2907 if (real_map != map)
2908 vm_map_unlock(real_map);
2909
2910 goto RetryFault;
2911 }
2912 }
2913 m = vm_page_alloc(object, offset);
2914
2915 if (m == VM_PAGE_NULL) {
2916 /*
2917 * no free page currently available...
2918 * must take the slow path
2919 */
2920 break;
2921 }
2922
2923 /*
2924 * Now zero fill page...
2925 * the page is probably going to
2926 * be written soon, so don't bother
2927 * to clear the modified bit
2928 *
2929 * NOTE: This code holds the map
2930 * lock across the zero fill.
2931 */
2932 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
2933
2934 goto FastPmapEnter;
2935 }
2936 /*
2937 * On to the next level in the shadow chain
2938 */
2939 cur_offset += cur_object->shadow_offset;
2940 new_object = cur_object->shadow;
2941
2942 /*
2943 * take the new_object's lock with the indicated state
2944 */
2945 if (cur_object_lock_type == OBJECT_LOCK_SHARED)
2946 vm_object_lock_shared(new_object);
2947 else
2948 vm_object_lock(new_object);
2949
2950 if (cur_object != object)
2951 vm_object_unlock(cur_object);
2952
2953 cur_object = new_object;
2954
2955 continue;
2956 }
2957 }
2958 /*
2959 * Cleanup from fast fault failure. Drop any object
2960 * lock other than original and drop map lock.
2961 */
2962 if (object != cur_object)
2963 vm_object_unlock(cur_object);
2964
2965 /*
2966 * must own the object lock exclusively at this point
2967 */
2968 if (object_lock_type == OBJECT_LOCK_SHARED) {
2969 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2970
2971 if (vm_object_lock_upgrade(object) == FALSE) {
2972 /*
2973 * couldn't upgrade, so explicitly
2974 * take the lock exclusively
2975 * no need to retry the fault at this
2976 * point since "vm_fault_page" will
2977 * completely re-evaluate the state
2978 */
2979 vm_object_lock(object);
2980 }
2981 }
2982
2983 handle_copy_delay:
2984 vm_map_unlock_read(map);
2985 if (real_map != map)
2986 vm_map_unlock(real_map);
2987
2988 /*
2989 * Make a reference to this object to
2990 * prevent its disposal while we are messing with
2991 * it. Once we have the reference, the map is free
2992 * to be diddled. Since objects reference their
2993 * shadows (and copies), they will stay around as well.
2994 */
2995 vm_object_reference_locked(object);
2996 vm_object_paging_begin(object);
2997
2998 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
2999
3000 error_code = 0;
3001
3002 kr = vm_fault_page(object, offset, fault_type,
3003 (change_wiring && !wired),
3004 &prot, &result_page, &top_page,
3005 &type_of_fault,
3006 &error_code, map->no_zero_fill,
3007 FALSE, &fault_info);
3008
3009 /*
3010 * if kr != VM_FAULT_SUCCESS, then the paging reference
3011 * has been dropped and the object unlocked... the ref_count
3012 * is still held
3013 *
3014 * if kr == VM_FAULT_SUCCESS, then the paging reference
3015 * is still held along with the ref_count on the original object
3016 *
3017 * if m != NULL, then the object it belongs to
3018 * is returned locked with a paging reference
3019 *
3020 * if top_page != NULL, then it's BUSY and the
3021 * object it belongs to has a paging reference
3022 * but is returned unlocked
3023 */
3024 if (kr != VM_FAULT_SUCCESS) {
3025 /*
3026 * we didn't succeed, lose the object reference immediately.
3027 */
3028 vm_object_deallocate(object);
3029
3030 /*
3031 * See why we failed, and take corrective action.
3032 */
3033 switch (kr) {
3034 case VM_FAULT_MEMORY_SHORTAGE:
3035 if (vm_page_wait((change_wiring) ?
3036 THREAD_UNINT :
3037 THREAD_ABORTSAFE))
3038 goto RetryFault;
3039 /*
3040 * fall thru
3041 */
3042 case VM_FAULT_INTERRUPTED:
3043 kr = KERN_ABORTED;
3044 goto done;
3045 case VM_FAULT_RETRY:
3046 goto RetryFault;
3047 case VM_FAULT_MEMORY_ERROR:
3048 if (error_code)
3049 kr = error_code;
3050 else
3051 kr = KERN_MEMORY_ERROR;
3052 goto done;
3053 }
3054 }
3055 m = result_page;
3056
3057 if (m != VM_PAGE_NULL) {
3058 assert((change_wiring && !wired) ?
3059 (top_page == VM_PAGE_NULL) :
3060 ((top_page == VM_PAGE_NULL) == (m->object == object)));
3061 }
3062
3063 /*
3064 * What to do with the resulting page from vm_fault_page
3065 * if it doesn't get entered into the physical map:
3066 */
3067 #define RELEASE_PAGE(m) \
3068 MACRO_BEGIN \
3069 PAGE_WAKEUP_DONE(m); \
3070 vm_page_lockspin_queues(); \
3071 if (!m->active && !m->inactive && !m->throttled)\
3072 vm_page_activate(m); \
3073 vm_page_unlock_queues(); \
3074 MACRO_END
3075
3076 /*
3077 * We must verify that the maps have not changed
3078 * since our last lookup.
3079 */
3080 if (m != VM_PAGE_NULL) {
3081 old_copy_object = m->object->copy;
3082 vm_object_unlock(m->object);
3083 } else
3084 old_copy_object = VM_OBJECT_NULL;
3085
3086 /*
3087 * no object locks are held at this point
3088 */
3089 if ((map != original_map) || !vm_map_verify(map, &version)) {
3090 vm_object_t retry_object;
3091 vm_object_offset_t retry_offset;
3092 vm_prot_t retry_prot;
3093
3094 /*
3095 * To avoid trying to write_lock the map while another
3096 * thread has it read_locked (in vm_map_pageable), we
3097 * do not try for write permission. If the page is
3098 * still writable, we will get write permission. If it
3099 * is not, or has been marked needs_copy, we enter the
3100 * mapping without write permission, and will merely
3101 * take another fault.
3102 */
3103 map = original_map;
3104 vm_map_lock_read(map);
3105
3106 kr = vm_map_lookup_locked(&map, vaddr,
3107 fault_type & ~VM_PROT_WRITE,
3108 OBJECT_LOCK_EXCLUSIVE, &version,
3109 &retry_object, &retry_offset, &retry_prot,
3110 &wired,
3111 &fault_info,
3112 &real_map);
3113 pmap = real_map->pmap;
3114
3115 if (kr != KERN_SUCCESS) {
3116 vm_map_unlock_read(map);
3117
3118 if (m != VM_PAGE_NULL) {
3119 /*
3120 * retake the lock so that
3121 * we can drop the paging reference
3122 * in vm_fault_cleanup and do the
3123 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3124 */
3125 vm_object_lock(m->object);
3126
3127 RELEASE_PAGE(m);
3128
3129 vm_fault_cleanup(m->object, top_page);
3130 } else {
3131 /*
3132 * retake the lock so that
3133 * we can drop the paging reference
3134 * in vm_fault_cleanup
3135 */
3136 vm_object_lock(object);
3137
3138 vm_fault_cleanup(object, top_page);
3139 }
3140 vm_object_deallocate(object);
3141
3142 goto done;
3143 }
3144 vm_object_unlock(retry_object);
3145
3146 if ((retry_object != object) || (retry_offset != offset)) {
3147
3148 vm_map_unlock_read(map);
3149 if (real_map != map)
3150 vm_map_unlock(real_map);
3151
3152 if (m != VM_PAGE_NULL) {
3153 /*
3154 * retake the lock so that
3155 * we can drop the paging reference
3156 * in vm_fault_cleanup and do the
3157 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3158 */
3159 vm_object_lock(m->object);
3160
3161 RELEASE_PAGE(m);
3162
3163 vm_fault_cleanup(m->object, top_page);
3164 } else {
3165 /*
3166 * retake the lock so that
3167 * we can drop the paging reference
3168 * in vm_fault_cleanup
3169 */
3170 vm_object_lock(object);
3171
3172 vm_fault_cleanup(object, top_page);
3173 }
3174 vm_object_deallocate(object);
3175
3176 goto RetryFault;
3177 }
3178 /*
3179 * Check whether the protection has changed or the object
3180 * has been copied while we left the map unlocked.
3181 */
3182 prot &= retry_prot;
3183 }
3184 if (m != VM_PAGE_NULL) {
3185 vm_object_lock(m->object);
3186
3187 if (m->object->copy != old_copy_object) {
3188 /*
3189 * The copy object changed while the top-level object
3190 * was unlocked, so take away write permission.
3191 */
3192 prot &= ~VM_PROT_WRITE;
3193 }
3194 } else
3195 vm_object_lock(object);
3196
3197 /*
3198 * If we want to wire down this page, but no longer have
3199 * adequate permissions, we must start all over.
3200 */
3201 if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
3202
3203 vm_map_verify_done(map, &version);
3204 if (real_map != map)
3205 vm_map_unlock(real_map);
3206
3207 if (m != VM_PAGE_NULL) {
3208 RELEASE_PAGE(m);
3209
3210 vm_fault_cleanup(m->object, top_page);
3211 } else
3212 vm_fault_cleanup(object, top_page);
3213
3214 vm_object_deallocate(object);
3215
3216 goto RetryFault;
3217 }
3218 if (m != VM_PAGE_NULL) {
3219 /*
3220 * Put this page into the physical map.
3221 * We had to do the unlock above because pmap_enter
3222 * may cause other faults. The page may be on
3223 * the pageout queues. If the pageout daemon comes
3224 * across the page, it will remove it from the queues.
3225 */
3226 if (caller_pmap) {
3227 kr = vm_fault_enter(m,
3228 caller_pmap,
3229 caller_pmap_addr,
3230 prot,
3231 wired,
3232 change_wiring,
3233 fault_info.no_cache,
3234 &type_of_fault);
3235 } else {
3236 kr = vm_fault_enter(m,
3237 pmap,
3238 vaddr,
3239 prot,
3240 wired,
3241 change_wiring,
3242 fault_info.no_cache,
3243 &type_of_fault);
3244 }
3245 if (kr != KERN_SUCCESS) {
3246 /* abort this page fault */
3247 vm_map_verify_done(map, &version);
3248 if (real_map != map)
3249 vm_map_unlock(real_map);
3250 PAGE_WAKEUP_DONE(m);
3251 vm_fault_cleanup(m->object, top_page);
3252 vm_object_deallocate(object);
3253 goto done;
3254 }
3255 } else {
3256
3257 vm_map_entry_t entry;
3258 vm_map_offset_t laddr;
3259 vm_map_offset_t ldelta, hdelta;
3260
3261 /*
3262 * do a pmap block mapping from the physical address
3263 * in the object
3264 */
3265
3266 #ifdef ppc
3267 /* While we do not worry about execution protection in */
3268 /* general, certain pages may have instruction execution */
3269 /* disallowed. We will check here, and if not allowed */
3270 /* to execute, we return with a protection failure. */
3271
3272 if ((fault_type & VM_PROT_EXECUTE) &&
3273 (!pmap_eligible_for_execute((ppnum_t)(object->shadow_offset >> 12)))) {
3274
3275 vm_map_verify_done(map, &version);
3276
3277 if (real_map != map)
3278 vm_map_unlock(real_map);
3279
3280 vm_fault_cleanup(object, top_page);
3281 vm_object_deallocate(object);
3282
3283 kr = KERN_PROTECTION_FAILURE;
3284 goto done;
3285 }
3286 #endif /* ppc */
3287
3288 if (real_map != map)
3289 vm_map_unlock(real_map);
3290
3291 if (original_map != map) {
3292 vm_map_unlock_read(map);
3293 vm_map_lock_read(original_map);
3294 map = original_map;
3295 }
3296 real_map = map;
3297
3298 laddr = vaddr;
3299 hdelta = 0xFFFFF000;
3300 ldelta = 0xFFFFF000;
3301
3302 while (vm_map_lookup_entry(map, laddr, &entry)) {
3303 if (ldelta > (laddr - entry->vme_start))
3304 ldelta = laddr - entry->vme_start;
3305 if (hdelta > (entry->vme_end - laddr))
3306 hdelta = entry->vme_end - laddr;
3307 if (entry->is_sub_map) {
3308
3309 laddr = (laddr - entry->vme_start)
3310 + entry->offset;
3311 vm_map_lock_read(entry->object.sub_map);
3312
3313 if (map != real_map)
3314 vm_map_unlock_read(map);
3315 if (entry->use_pmap) {
3316 vm_map_unlock_read(real_map);
3317 real_map = entry->object.sub_map;
3318 }
3319 map = entry->object.sub_map;
3320
3321 } else {
3322 break;
3323 }
3324 }
3325
3326 if (vm_map_lookup_entry(map, laddr, &entry) &&
3327 (entry->object.vm_object != NULL) &&
3328 (entry->object.vm_object == object)) {
3329
3330 if (caller_pmap) {
3331 /*
3332 * Set up a block mapped area
3333 */
3334 pmap_map_block(caller_pmap,
3335 (addr64_t)(caller_pmap_addr - ldelta),
3336 (((vm_map_offset_t) (entry->object.vm_object->shadow_offset)) +
3337 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12,
3338 ((ldelta + hdelta) >> 12), prot,
3339 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3340 } else {
3341 /*
3342 * Set up a block mapped area
3343 */
3344 pmap_map_block(real_map->pmap,
3345 (addr64_t)(vaddr - ldelta),
3346 (((vm_map_offset_t)(entry->object.vm_object->shadow_offset)) +
3347 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12,
3348 ((ldelta + hdelta) >> 12), prot,
3349 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3350 }
3351 }
3352 }
3353
3354 /*
3355 * Unlock everything, and return
3356 */
3357 vm_map_verify_done(map, &version);
3358 if (real_map != map)
3359 vm_map_unlock(real_map);
3360
3361 if (m != VM_PAGE_NULL) {
3362 PAGE_WAKEUP_DONE(m);
3363
3364 vm_fault_cleanup(m->object, top_page);
3365 } else
3366 vm_fault_cleanup(object, top_page);
3367
3368 vm_object_deallocate(object);
3369
3370 #undef RELEASE_PAGE
3371
3372 kr = KERN_SUCCESS;
3373 done:
3374 thread_interrupt_level(interruptible_state);
3375
3376 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3377 (int)((uint64_t)vaddr >> 32),
3378 (int)vaddr,
3379 kr,
3380 type_of_fault,
3381 0);
3382
3383 return (kr);
3384 }
3385
3386 /*
3387 * vm_fault_wire:
3388 *
3389 * Wire down a range of virtual addresses in a map.
3390 */
3391 kern_return_t
3392 vm_fault_wire(
3393 vm_map_t map,
3394 vm_map_entry_t entry,
3395 pmap_t pmap,
3396 vm_map_offset_t pmap_addr)
3397 {
3398
3399 register vm_map_offset_t va;
3400 register vm_map_offset_t end_addr = entry->vme_end;
3401 register kern_return_t rc;
3402
3403 assert(entry->in_transition);
3404
3405 if ((entry->object.vm_object != NULL) &&
3406 !entry->is_sub_map &&
3407 entry->object.vm_object->phys_contiguous) {
3408 return KERN_SUCCESS;
3409 }
3410
3411 /*
3412 * Inform the physical mapping system that the
3413 * range of addresses may not fault, so that
3414 * page tables and such can be locked down as well.
3415 */
3416
3417 pmap_pageable(pmap, pmap_addr,
3418 pmap_addr + (end_addr - entry->vme_start), FALSE);
3419
3420 /*
3421 * We simulate a fault to get the page and enter it
3422 * in the physical map.
3423 */
3424
3425 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3426 if ((rc = vm_fault_wire_fast(
3427 map, va, entry, pmap,
3428 pmap_addr + (va - entry->vme_start)
3429 )) != KERN_SUCCESS) {
3430 rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
3431 (pmap == kernel_pmap) ?
3432 THREAD_UNINT : THREAD_ABORTSAFE,
3433 pmap, pmap_addr + (va - entry->vme_start));
3434 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
3435 }
3436
3437 if (rc != KERN_SUCCESS) {
3438 struct vm_map_entry tmp_entry = *entry;
3439
3440 /* unwire wired pages */
3441 tmp_entry.vme_end = va;
3442 vm_fault_unwire(map,
3443 &tmp_entry, FALSE, pmap, pmap_addr);
3444
3445 return rc;
3446 }
3447 }
3448 return KERN_SUCCESS;
3449 }
3450
3451 /*
3452 * vm_fault_unwire:
3453 *
3454 * Unwire a range of virtual addresses in a map.
3455 */
3456 void
3457 vm_fault_unwire(
3458 vm_map_t map,
3459 vm_map_entry_t entry,
3460 boolean_t deallocate,
3461 pmap_t pmap,
3462 vm_map_offset_t pmap_addr)
3463 {
3464 register vm_map_offset_t va;
3465 register vm_map_offset_t end_addr = entry->vme_end;
3466 vm_object_t object;
3467 struct vm_object_fault_info fault_info;
3468
3469 object = (entry->is_sub_map)
3470 ? VM_OBJECT_NULL : entry->object.vm_object;
3471
3472 /*
3473 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
3474 * do anything since such memory is wired by default. So we don't have
3475 * anything to undo here.
3476 */
3477
3478 if (object != VM_OBJECT_NULL && object->phys_contiguous)
3479 return;
3480
3481 fault_info.interruptible = THREAD_UNINT;
3482 fault_info.behavior = entry->behavior;
3483 fault_info.user_tag = entry->alias;
3484 fault_info.lo_offset = entry->offset;
3485 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
3486 fault_info.no_cache = entry->no_cache;
3487
3488 /*
3489 * Since the pages are wired down, we must be able to
3490 * get their mappings from the physical map system.
3491 */
3492
3493 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3494
3495 if (object == VM_OBJECT_NULL) {
3496 if (pmap) {
3497 pmap_change_wiring(pmap,
3498 pmap_addr + (va - entry->vme_start), FALSE);
3499 }
3500 (void) vm_fault(map, va, VM_PROT_NONE,
3501 TRUE, THREAD_UNINT, pmap, pmap_addr);
3502 } else {
3503 vm_prot_t prot;
3504 vm_page_t result_page;
3505 vm_page_t top_page;
3506 vm_object_t result_object;
3507 vm_fault_return_t result;
3508
3509 fault_info.cluster_size = end_addr - va;
3510
3511 do {
3512 prot = VM_PROT_NONE;
3513
3514 vm_object_lock(object);
3515 vm_object_paging_begin(object);
3516 XPR(XPR_VM_FAULT,
3517 "vm_fault_unwire -> vm_fault_page\n",
3518 0,0,0,0,0);
3519 result = vm_fault_page(
3520 object,
3521 entry->offset + (va - entry->vme_start),
3522 VM_PROT_NONE, TRUE,
3523 &prot, &result_page, &top_page,
3524 (int *)0,
3525 NULL, map->no_zero_fill,
3526 FALSE, &fault_info);
3527 } while (result == VM_FAULT_RETRY);
3528
3529 /*
3530 * If this was a mapping to a file on a device that has been forcibly
3531 * unmounted, then we won't get a page back from vm_fault_page(). Just
3532 * move on to the next one in case the remaining pages are mapped from
3533 * different objects. During a forced unmount, the object is terminated
3534 * so the alive flag will be false if this happens. A forced unmount
3535 * will occur when an external disk is unplugged before the user does an
3536 * eject, so we don't want to panic in that situation.
3537 */
3538
3539 if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
3540 continue;
3541
3542 if (result != VM_FAULT_SUCCESS)
3543 panic("vm_fault_unwire: failure");
3544
3545 result_object = result_page->object;
3546
3547 if ((pmap) && (result_page->phys_page != vm_page_guard_addr)) {
3548 pmap_change_wiring(pmap,
3549 pmap_addr + (va - entry->vme_start), FALSE);
3550 }
3551 if (deallocate) {
3552 assert(result_page->phys_page !=
3553 vm_page_fictitious_addr);
3554 pmap_disconnect(result_page->phys_page);
3555 VM_PAGE_FREE(result_page);
3556 } else {
3557 vm_page_lockspin_queues();
3558 vm_page_unwire(result_page);
3559 vm_page_unlock_queues();
3560 PAGE_WAKEUP_DONE(result_page);
3561 }
3562 vm_fault_cleanup(result_object, top_page);
3563 }
3564 }
3565
3566 /*
3567 * Inform the physical mapping system that the range
3568 * of addresses may fault, so that page tables and
3569 * such may be unwired themselves.
3570 */
3571
3572 pmap_pageable(pmap, pmap_addr,
3573 pmap_addr + (end_addr - entry->vme_start), TRUE);
3574
3575 }
3576
3577 /*
3578 * vm_fault_wire_fast:
3579 *
3580 * Handle common case of a wire down page fault at the given address.
3581 * If successful, the page is inserted into the associated physical map.
3582 * The map entry is passed in to avoid the overhead of a map lookup.
3583 *
3584 * NOTE: the given address should be truncated to the
3585 * proper page address.
3586 *
3587 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
3588 * a standard error specifying why the fault is fatal is returned.
3589 *
3590 * The map in question must be referenced, and remains so.
3591 * Caller has a read lock on the map.
3592 *
3593 * This is a stripped version of vm_fault() for wiring pages. Anything
3594 * other than the common case will return KERN_FAILURE, and the caller
3595 * is expected to call vm_fault().
3596 */
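/*
 * Fallback contract (as exercised by vm_fault_wire() above; repeated here
 * only as a sketch):
 *
 *	if (vm_fault_wire_fast(map, va, entry, pmap, pmap_addr) != KERN_SUCCESS)
 *		rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
 *			      (pmap == kernel_pmap) ? THREAD_UNINT : THREAD_ABORTSAFE,
 *			      pmap, pmap_addr + (va - entry->vme_start));
 */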
3597 kern_return_t
3598 vm_fault_wire_fast(
3599 __unused vm_map_t map,
3600 vm_map_offset_t va,
3601 vm_map_entry_t entry,
3602 pmap_t pmap,
3603 vm_map_offset_t pmap_addr)
3604 {
3605 vm_object_t object;
3606 vm_object_offset_t offset;
3607 register vm_page_t m;
3608 vm_prot_t prot;
3609 thread_t thread = current_thread();
3610 int type_of_fault;
3611 kern_return_t kr;
3612
3613 VM_STAT_INCR(faults);
3614
3615 if (thread != THREAD_NULL && thread->task != TASK_NULL)
3616 thread->task->faults++;
3617
3618 /*
3619 * Recovery actions
3620 */
3621
3622 #undef RELEASE_PAGE
3623 #define RELEASE_PAGE(m) { \
3624 PAGE_WAKEUP_DONE(m); \
3625 vm_page_lockspin_queues(); \
3626 vm_page_unwire(m); \
3627 vm_page_unlock_queues(); \
3628 }
3629
3630
3631 #undef UNLOCK_THINGS
3632 #define UNLOCK_THINGS { \
3633 vm_object_paging_end(object); \
3634 vm_object_unlock(object); \
3635 }
3636
3637 #undef UNLOCK_AND_DEALLOCATE
3638 #define UNLOCK_AND_DEALLOCATE { \
3639 UNLOCK_THINGS; \
3640 vm_object_deallocate(object); \
3641 }
3642 /*
3643 * Give up and have caller do things the hard way.
3644 */
3645
3646 #define GIVE_UP { \
3647 UNLOCK_AND_DEALLOCATE; \
3648 return(KERN_FAILURE); \
3649 }
3650
3651
3652 /*
3653 * If this entry is not directly to a vm_object, bail out.
3654 */
3655 if (entry->is_sub_map)
3656 return(KERN_FAILURE);
3657
3658 /*
3659 * Find the backing store object and offset into it.
3660 */
3661
3662 object = entry->object.vm_object;
3663 offset = (va - entry->vme_start) + entry->offset;
3664 prot = entry->protection;
3665
3666 /*
3667 * Make a reference to this object to prevent its
3668 * disposal while we are messing with it.
3669 */
3670
3671 vm_object_lock(object);
3672 vm_object_reference_locked(object);
3673 vm_object_paging_begin(object);
3674
3675 /*
3676 * INVARIANTS (through entire routine):
3677 *
3678 * 1) At all times, we must either have the object
3679 * lock or a busy page in some object to prevent
3680 * some other thread from trying to bring in
3681 * the same page.
3682 *
3683 * 2) Once we have a busy page, we must remove it from
3684 * the pageout queues, so that the pageout daemon
3685 * will not grab it away.
3686 *
3687 */
3688
3689 /*
3690 * Look for page in top-level object. If it's not there or
3691 * there's something going on, give up.
3692 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
3693 * decrypt the page before wiring it down.
3694 */
3695 m = vm_page_lookup(object, offset);
3696 if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
3697 (m->unusual && ( m->error || m->restart || m->absent))) {
3698
3699 GIVE_UP;
3700 }
3701 ASSERT_PAGE_DECRYPTED(m);
3702
3703 if (m->fictitious &&
3704 m->phys_page == vm_page_guard_addr) {
3705 /*
3706 * Guard pages are fictitious pages and are never
3707 * entered into a pmap, so let's say it's been wired...
3708 */
3709 kr = KERN_SUCCESS;
3710 goto done;
3711 }
3712
3713 /*
3714 * Wire the page down now. All bail outs beyond this
3715 * point must unwire the page.
3716 */
3717
3718 vm_page_lockspin_queues();
3719 vm_page_wire(m);
3720 vm_page_unlock_queues();
3721
3722 /*
3723 * Mark page busy for other threads.
3724 */
3725 assert(!m->busy);
3726 m->busy = TRUE;
3727 assert(!m->absent);
3728
3729 /*
3730 * Give up if the page is being written and there's a copy object
3731 */
3732 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
3733 RELEASE_PAGE(m);
3734 GIVE_UP;
3735 }
3736
3737 /*
3738 * Put this page into the physical map.
3739 */
3740 type_of_fault = DBG_CACHE_HIT_FAULT;
3741 kr = vm_fault_enter(m,
3742 pmap,
3743 pmap_addr,
3744 prot,
3745 TRUE,
3746 FALSE,
3747 FALSE,
3748 &type_of_fault);
3749
3750 done:
3751 /*
3752 * Unlock everything, and return
3753 */
3754
3755 PAGE_WAKEUP_DONE(m);
3756 UNLOCK_AND_DEALLOCATE;
3757
3758 return kr;
3759
3760 }
3761
3762 /*
3763 * Routine: vm_fault_copy_cleanup
3764 * Purpose:
3765 * Release a page used by vm_fault_copy.
3766 */
3767
3768 void
3769 vm_fault_copy_cleanup(
3770 vm_page_t page,
3771 vm_page_t top_page)
3772 {
3773 vm_object_t object = page->object;
3774
3775 vm_object_lock(object);
3776 PAGE_WAKEUP_DONE(page);
3777 vm_page_lockspin_queues();
3778 if (!page->active && !page->inactive && !page->throttled)
3779 vm_page_activate(page);
3780 vm_page_unlock_queues();
3781 vm_fault_cleanup(object, top_page);
3782 }
3783
3784 void
3785 vm_fault_copy_dst_cleanup(
3786 vm_page_t page)
3787 {
3788 vm_object_t object;
3789
3790 if (page != VM_PAGE_NULL) {
3791 object = page->object;
3792 vm_object_lock(object);
3793 vm_page_lockspin_queues();
3794 vm_page_unwire(page);
3795 vm_page_unlock_queues();
3796 vm_object_paging_end(object);
3797 vm_object_unlock(object);
3798 }
3799 }
3800
3801 /*
3802 * Routine: vm_fault_copy
3803 *
3804 * Purpose:
3805 * Copy pages from one virtual memory object to another --
3806 * neither the source nor destination pages need be resident.
3807 *
3808 * Before actually copying a page, the version associated with
3809 * the destination address map will be verified.
3810 *
3811 * In/out conditions:
3812 * The caller must hold a reference, but not a lock, to
3813 * each of the source and destination objects and to the
3814 * destination map.
3815 *
3816 * Results:
3817 * Returns KERN_SUCCESS if no errors were encountered in
3818 * reading or writing the data. Returns KERN_INTERRUPTED if
3819 * the operation was interrupted (only possible if the
3820 * "interruptible" argument is asserted). Other return values
3821 * indicate a permanent error in copying the data.
3822 *
3823 * The actual amount of data copied will be returned in the
3824 * "copy_size" argument. In the event that the destination map
3825 * verification failed, this amount may be less than the amount
3826 * requested.
3827 */
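/*
 * Usage sketch (a hypothetical caller; real callers obtain dst_version
 * from a prior map lookup and may get less than requested copied if the
 * map verification fails):
 *
 *	vm_map_size_t len = nbytes;
 *	kr = vm_fault_copy(src_object, src_offset, &len,
 *			   dst_object, dst_offset,
 *			   dst_map, &dst_version, THREAD_UNINT);
 *	// on return, "len" holds the number of bytes actually copied
 */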
3828 kern_return_t
3829 vm_fault_copy(
3830 vm_object_t src_object,
3831 vm_object_offset_t src_offset,
3832 vm_map_size_t *copy_size, /* INOUT */
3833 vm_object_t dst_object,
3834 vm_object_offset_t dst_offset,
3835 vm_map_t dst_map,
3836 vm_map_version_t *dst_version,
3837 int interruptible)
3838 {
3839 vm_page_t result_page;
3840
3841 vm_page_t src_page;
3842 vm_page_t src_top_page;
3843 vm_prot_t src_prot;
3844
3845 vm_page_t dst_page;
3846 vm_page_t dst_top_page;
3847 vm_prot_t dst_prot;
3848
3849 vm_map_size_t amount_left;
3850 vm_object_t old_copy_object;
3851 kern_return_t error = 0;
3852
3853 vm_map_size_t part_size;
3854 struct vm_object_fault_info fault_info_src;
3855 struct vm_object_fault_info fault_info_dst;
3856
3857 /*
3858 * In order not to confuse the clustered pageins, align
3859 * the different offsets on a page boundary.
3860 */
3861
3862 #define RETURN(x) \
3863 MACRO_BEGIN \
3864 *copy_size -= amount_left; \
3865 MACRO_RETURN(x); \
3866 MACRO_END
3867
3868 amount_left = *copy_size;
3869
3870 fault_info_src.interruptible = interruptible;
3871 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
3872 fault_info_src.user_tag = 0;
3873 fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
3874 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
3875 fault_info_src.no_cache = FALSE;
3876
3877 fault_info_dst.interruptible = interruptible;
3878 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
3879 fault_info_dst.user_tag = 0;
3880 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
3881 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
3882 fault_info_dst.no_cache = FALSE;
3883
3884 do { /* while (amount_left > 0) */
3885 /*
3886 * There may be a deadlock if both source and destination
3887 * pages are the same. To avoid this deadlock, the copy must
3888 * start by getting the destination page in order to apply
3889 * COW semantics if any.
3890 */
3891
3892 RetryDestinationFault: ;
3893
3894 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
3895
3896 vm_object_lock(dst_object);
3897 vm_object_paging_begin(dst_object);
3898
3899 fault_info_dst.cluster_size = amount_left;
3900
3901 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
3902 switch (vm_fault_page(dst_object,
3903 vm_object_trunc_page(dst_offset),
3904 VM_PROT_WRITE|VM_PROT_READ,
3905 FALSE,
3906 &dst_prot, &dst_page, &dst_top_page,
3907 (int *)0,
3908 &error,
3909 dst_map->no_zero_fill,
3910 FALSE, &fault_info_dst)) {
3911 case VM_FAULT_SUCCESS:
3912 break;
3913 case VM_FAULT_RETRY:
3914 goto RetryDestinationFault;
3915 case VM_FAULT_MEMORY_SHORTAGE:
3916 if (vm_page_wait(interruptible))
3917 goto RetryDestinationFault;
3918 /* fall thru */
3919 case VM_FAULT_INTERRUPTED:
3920 RETURN(MACH_SEND_INTERRUPTED);
3921 case VM_FAULT_MEMORY_ERROR:
3922 if (error)
3923 return (error);
3924 else
3925 return(KERN_MEMORY_ERROR);
3926 }
3927 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
3928
3929 old_copy_object = dst_page->object->copy;
3930
3931 /*
3932 * There exists the possibility that the source and
3933 * destination page are the same. But we can't
3934 * easily determine that now. If they are the
3935 * same, the call to vm_fault_page() for the
3936 * destination page will deadlock. To prevent this we
3937 * wire the page so we can drop busy without having
3938 * the page daemon steal the page. We clean up the
3939 * top page but keep the paging reference on the object
3940 * holding the dest page so it doesn't go away.
3941 */
3942
3943 vm_page_lockspin_queues();
3944 vm_page_wire(dst_page);
3945 vm_page_unlock_queues();
3946 PAGE_WAKEUP_DONE(dst_page);
3947 vm_object_unlock(dst_page->object);
3948
3949 if (dst_top_page != VM_PAGE_NULL) {
3950 vm_object_lock(dst_object);
3951 VM_PAGE_FREE(dst_top_page);
3952 vm_object_paging_end(dst_object);
3953 vm_object_unlock(dst_object);
3954 }
3955
3956 RetrySourceFault: ;
3957
3958 if (src_object == VM_OBJECT_NULL) {
3959 /*
3960 * No source object. We will just
3961 * zero-fill the page in dst_object.
3962 */
3963 src_page = VM_PAGE_NULL;
3964 result_page = VM_PAGE_NULL;
3965 } else {
3966 vm_object_lock(src_object);
3967 src_page = vm_page_lookup(src_object,
3968 vm_object_trunc_page(src_offset));
3969 if (src_page == dst_page) {
3970 src_prot = dst_prot;
3971 result_page = VM_PAGE_NULL;
3972 } else {
3973 src_prot = VM_PROT_READ;
3974 vm_object_paging_begin(src_object);
3975
3976 fault_info_src.cluster_size = amount_left;
3977
3978 XPR(XPR_VM_FAULT,
3979 "vm_fault_copy(2) -> vm_fault_page\n",
3980 0,0,0,0,0);
3981 switch (vm_fault_page(
3982 src_object,
3983 vm_object_trunc_page(src_offset),
3984 VM_PROT_READ, FALSE,
3985 &src_prot,
3986 &result_page, &src_top_page,
3987 (int *)0, &error, FALSE,
3988 FALSE, &fault_info_src)) {
3989
3990 case VM_FAULT_SUCCESS:
3991 break;
3992 case VM_FAULT_RETRY:
3993 goto RetrySourceFault;
3994 case VM_FAULT_MEMORY_SHORTAGE:
3995 if (vm_page_wait(interruptible))
3996 goto RetrySourceFault;
3997 /* fall thru */
3998 case VM_FAULT_INTERRUPTED:
3999 vm_fault_copy_dst_cleanup(dst_page);
4000 RETURN(MACH_SEND_INTERRUPTED);
4001 case VM_FAULT_MEMORY_ERROR:
4002 vm_fault_copy_dst_cleanup(dst_page);
4003 if (error)
4004 return (error);
4005 else
4006 return(KERN_MEMORY_ERROR);
4007 }
4008
4009
4010 assert((src_top_page == VM_PAGE_NULL) ==
4011 (result_page->object == src_object));
4012 }
4013 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
4014 /* if src_page == dst_page, no source fault was taken, so src_object is still the locked object */
if (result_page == VM_PAGE_NULL)
vm_object_unlock(src_object);
else
vm_object_unlock(result_page->object);
4015 }
4016
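/*
 * The destination map is not kept locked across the faults above,
 * so re-check it: vm_map_verify() takes the map read lock and
 * checks that dst_version is still current, leaving the map locked
 * if it is. If the map has changed, release the pages and give up
 * on this portion of the copy.
 */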
4017 if (!vm_map_verify(dst_map, dst_version)) {
4018 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4019 vm_fault_copy_cleanup(result_page, src_top_page);
4020 vm_fault_copy_dst_cleanup(dst_page);
4021 break;
4022 }
4023
4024 vm_object_lock(dst_page->object);
4025
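/*
 * With the destination page's object locked again, make sure no
 * copy object was created for (or swapped under) it while the page
 * was unbusied; writing to the page now could bypass a needed
 * copy-on-write push, so give the page back and stop here instead.
 */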
4026 if (dst_page->object->copy != old_copy_object) {
4027 vm_object_unlock(dst_page->object);
4028 vm_map_verify_done(dst_map, dst_version);
4029 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4030 vm_fault_copy_cleanup(result_page, src_top_page);
4031 vm_fault_copy_dst_cleanup(dst_page);
4032 break;
4033 }
4034 vm_object_unlock(dst_page->object);
4035
4036 /*
4037 * Copy the page, and note that it is dirty
4038 * immediately.
4039 */
4040
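/*
 * Unaligned case: the copy can only proceed up to whichever page
 * boundary (source or destination) comes first.  part_size runs
 * from the larger of the two intra-page offsets to the end of the
 * page, clipped to amount_left.  (For example, with 4K pages,
 * src_po = 0x200 and dst_po = 0x600 give part_size = 0xa00.)
 */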
4041 if (!page_aligned(src_offset) ||
4042 !page_aligned(dst_offset) ||
4043 !page_aligned(amount_left)) {
4044
4045 vm_object_offset_t src_po,
4046 dst_po;
4047
4048 src_po = src_offset - vm_object_trunc_page(src_offset);
4049 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
4050
4051 if (dst_po > src_po) {
4052 part_size = PAGE_SIZE - dst_po;
4053 } else {
4054 part_size = PAGE_SIZE - src_po;
4055 }
4056 if (part_size > amount_left) {
4057 part_size = amount_left;
4058 }
4059
4060 if (result_page == VM_PAGE_NULL) {
4061 vm_page_part_zero_fill(dst_page,
4062 dst_po, part_size);
4063 } else {
4064 vm_page_part_copy(result_page, src_po,
4065 dst_page, dst_po, part_size);
4066 if (!dst_page->dirty) {
4067 vm_object_lock(dst_page->object);
4068 dst_page->dirty = TRUE;
4069 vm_object_unlock(dst_page->object);
4070 }
4071
4072 }
4073 } else {
4074 part_size = PAGE_SIZE;
4075
4076 if (result_page == VM_PAGE_NULL)
4077 vm_page_zero_fill(dst_page);
4078 else {
4079 vm_page_copy(result_page, dst_page);
4080 if (!dst_page->dirty) {
4081 vm_object_lock(dst_page->object);
4082 dst_page->dirty = TRUE;
4083 vm_object_unlock(dst_page->object);
4084 }
4085 }
4086
4087 }
4088
4089 /*
4090 * Unlock everything, and move on to the next part of the copy
4091 */
4092
4093 vm_map_verify_done(dst_map, dst_version);
4094
4095 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4096 vm_fault_copy_cleanup(result_page, src_top_page);
4097 vm_fault_copy_dst_cleanup(dst_page);
4098
4099 amount_left -= part_size;
4100 src_offset += part_size;
4101 dst_offset += part_size;
4102 } while (amount_left > 0);
4103
4104 RETURN(KERN_SUCCESS);
4105 #undef RETURN
4106
4107 /*NOTREACHED*/
4108 }
4109
4110 #if VM_FAULT_CLASSIFY
4111 /*
4112 * Temporary statistics gathering support.
4113 */
4114
4115 /*
4116 * Statistics arrays:
4117 */
4118 #define VM_FAULT_TYPES_MAX 5
4119 #define VM_FAULT_LEVEL_MAX 8
4120
4121 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4122
4123 #define VM_FAULT_TYPE_ZERO_FILL 0
4124 #define VM_FAULT_TYPE_MAP_IN 1
4125 #define VM_FAULT_TYPE_PAGER 2
4126 #define VM_FAULT_TYPE_COPY 3
4127 #define VM_FAULT_TYPE_OTHER 4
4128
4129
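/*
 * Classify a fault by walking the shadow chain the way
 * vm_fault_page() would: record, per fault type (zero fill,
 * map in, pager, copy, other) and per shadow depth, how the
 * fault would have been resolved.
 */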
4130 void
4131 vm_fault_classify(vm_object_t object,
4132 vm_object_offset_t offset,
4133 vm_prot_t fault_type)
4134 {
4135 int type, level = 0;
4136 vm_page_t m;
4137
4138 while (TRUE) {
4139 m = vm_page_lookup(object, offset);
4140 if (m != VM_PAGE_NULL) {
4141 if (m->busy || m->error || m->restart || m->absent) {
4142 type = VM_FAULT_TYPE_OTHER;
4143 break;
4144 }
4145 if (((fault_type & VM_PROT_WRITE) == 0) ||
4146 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4147 type = VM_FAULT_TYPE_MAP_IN;
4148 break;
4149 }
4150 type = VM_FAULT_TYPE_COPY;
4151 break;
4152 }
4153 else {
4154 if (object->pager_created) {
4155 type = VM_FAULT_TYPE_PAGER;
4156 break;
4157 }
4158 if (object->shadow == VM_OBJECT_NULL) {
4159 type = VM_FAULT_TYPE_ZERO_FILL;
4160 break;
4161 }
4162
4163 offset += object->shadow_offset;
4164 object = object->shadow;
4165 level++;
4166 continue;
4167 }
4168 }
4169
4170 if (level >= VM_FAULT_LEVEL_MAX)
4171 level = VM_FAULT_LEVEL_MAX - 1;
4172
4173 vm_fault_stats[type][level] += 1;
4174
4175 return;
4176 }
4177
4178 /* cleanup routine to call from debugger */
4179
4180 void
4181 vm_fault_classify_init(void)
4182 {
4183 int type, level;
4184
4185 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
4186 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
4187 vm_fault_stats[type][level] = 0;
4188 }
4189 }
4190
4191 return;
4192 }
4193 #endif /* VM_FAULT_CLASSIFY */
4194
4195
4196 extern int cs_validation;
4197
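/*
 * Validate the code signature of a page that is already mapped
 * at kaddr.  The caller must hold the page busy and must hold
 * the page's object lock exclusively.
 */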
4198 void
4199 vm_page_validate_cs_mapped(
4200 vm_page_t page,
4201 const void *kaddr)
4202 {
4203 vm_object_t object;
4204 vm_object_offset_t offset;
4205 kern_return_t kr;
4206 memory_object_t pager;
4207 void *blobs;
4208 boolean_t validated, tainted;
4209
4210 assert(page->busy);
4211 vm_object_lock_assert_exclusive(page->object);
4212
4213 if (!cs_validation) {
4214 return;
4215 }
4216
4217 if (page->wpmapped && !page->cs_tainted) {
4218 /*
4219 * This page was mapped for "write" access sometime in the
4220 * past and could still be modifiable in the future.
4221 * Consider it tainted.
4222 * [ If the page was already found to be "tainted", no
4223 * need to re-validate. ]
4224 */
4225 page->cs_validated = TRUE;
4226 page->cs_tainted = TRUE;
4227 if (cs_debug) {
4228 printf("CODESIGNING: vm_page_validate_cs: "
4229 "page %p obj %p off 0x%llx "
4230 "was modified\n",
4231 page, page->object, page->offset);
4232 }
4233 vm_cs_validated_dirtied++;
4234 }
4235
4236 if (page->cs_validated) {
4237 return;
4238 }
4239
4240 vm_cs_validates++;
4241
4242 object = page->object;
4243 assert(object->code_signed);
4244 offset = page->offset;
4245
4246 if (!object->alive || object->terminating || object->pager == NULL) {
4247 /*
4248 * The object is dead or terminating, or we don't have its
4249 * pager, so we can't validate the data...
4250 */
4251 return;
4252 }
4253 /*
4254 * Since we get here to validate a page that was brought in by
4255 * the pager, we know that this pager is fully set up and ready
4256 * by now.
4257 */
4258 assert(!object->internal);
4259 assert(object->pager != NULL);
4260 assert(object->pager_ready);
4261
4262 pager = object->pager;
4263
4264 kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
4265 if (kr != KERN_SUCCESS) {
4266 blobs = NULL;
4267 }
4268
4269 /* verify the SHA1 hash for this page */
4270 validated = cs_validate_page(blobs,
4271 offset + object->paging_offset,
4272 (const void *)kaddr,
4273 &tainted);
4274
4275 page->cs_validated = validated;
4276 if (validated) {
4277 page->cs_tainted = tainted;
4278 }
4279 }
4280
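/*
 * Validate the code signature of a page that may not be mapped
 * yet: take a paging reference on its object, map the page into
 * the kernel address space, validate it with
 * vm_page_validate_cs_mapped(), then tear the mapping down.
 */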
4281 void
4282 vm_page_validate_cs(
4283 vm_page_t page)
4284 {
4285 vm_object_t object;
4286 vm_object_offset_t offset;
4287 vm_map_offset_t koffset;
4288 vm_map_size_t ksize;
4289 vm_offset_t kaddr;
4290 kern_return_t kr;
4291 boolean_t busy_page;
4292
4293 vm_object_lock_assert_held(page->object);
4294
4295 if (!cs_validation) {
4296 return;
4297 }
4298
4299 if (page->wpmapped && !page->cs_tainted) {
4300 vm_object_lock_assert_exclusive(page->object);
4301
4302 /*
4303 * This page was mapped for "write" access sometime in the
4304 * past and could still be modifiable in the future.
4305 * Consider it tainted.
4306 * [ If the page was already found to be "tainted", no
4307 * need to re-validate. ]
4308 */
4309 page->cs_validated = TRUE;
4310 page->cs_tainted = TRUE;
4311 if (cs_debug) {
4312 printf("CODESIGNING: vm_page_validate_cs: "
4313 "page %p obj %p off 0x%llx "
4314 "was modified\n",
4315 page, page->object, page->offset);
4316 }
4317 vm_cs_validated_dirtied++;
4318 }
4319
4320 if (page->cs_validated) {
4321 return;
4322 }
4323
4324 vm_object_lock_assert_exclusive(page->object);
4325
4326 object = page->object;
4327 assert(object->code_signed);
4328 offset = page->offset;
4329
4330 busy_page = page->busy;
4331 if (!busy_page) {
4332 /* keep page busy while we map (and unlock) the VM object */
4333 page->busy = TRUE;
4334 }
4335
4336 /*
4337 * Take a paging reference on the VM object
4338 * to protect it from collapse or bypass,
4339 * and keep it from disappearing too.
4340 */
4341 vm_object_paging_begin(object);
4342
4343 /* map the page in the kernel address space */
4344 koffset = 0;
4345 ksize = PAGE_SIZE_64;
4346 kr = vm_paging_map_object(&koffset,
4347 page,
4348 object,
4349 offset,
4350 &ksize,
4351 VM_PROT_READ,
4352 FALSE); /* can't unlock object ! */
4353 if (kr != KERN_SUCCESS) {
4354 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
4355 }
4356 kaddr = CAST_DOWN(vm_offset_t, koffset);
4357
4358 /* validate the mapped page */
4359 vm_page_validate_cs_mapped(page, (const void *) kaddr);
4360
4361 assert(page->busy);
4362 assert(object == page->object);
4363 vm_object_lock_assert_exclusive(object);
4364
4365 if (!busy_page) {
4366 PAGE_WAKEUP_DONE(page);
4367 }
4368 if (koffset != 0) {
4369 /* unmap the page from the kernel address space */
4370 vm_paging_unmap_object(object, koffset, koffset + ksize);
4371 koffset = 0;
4372 ksize = 0;
4373 kaddr = 0;
4374 }
4375 vm_object_paging_end(object);
4376 }