osfmk/vm/vm_fault.c

   1 /*
   2  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /*
  23  * @OSF_COPYRIGHT@
  24  */
  25 /*
  26  * Mach Operating System
  27  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  28  * All Rights Reserved.
  29  *
  30  * Permission to use, copy, modify and distribute this software and its
  31  * documentation is hereby granted, provided that both the copyright
  32  * notice and this permission notice appear in all copies of the
  33  * software, derivative works or modified versions, and any portions
  34  * thereof, and that both notices appear in supporting documentation.
  35  *
  36  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  37  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  38  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  39  *
  40  * Carnegie Mellon requests users of this software to return to
  41  *
  42  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  43  *  School of Computer Science
  44  *  Carnegie Mellon University
  45  *  Pittsburgh PA 15213-3890
  46  *
  47  * any improvements or extensions that they make and grant Carnegie Mellon
  48  * the rights to redistribute these changes.
  49  */
  50 /*
  51  */
  52 /*
  53  *      File:   vm_fault.c
  54  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  55  *
  56  *      Page fault handling module.
  57  */
  58 #ifdef MACH_BSD
  59 /* remove after component interface available */
  60 extern int      vnode_pager_workaround;
  61 #endif
  62
  63 #include <mach_cluster_stats.h>
  64 #include <mach_pagemap.h>
  65 #include <mach_kdb.h>
  66
  67 #include <vm/vm_fault.h>
  68 #include <mach/kern_return.h>
  69 #include <mach/message.h>       /* for error codes */
  70 #include <kern/host_statistics.h>
  71 #include <kern/counters.h>
  72 #include <kern/task.h>
  73 #include <kern/thread.h>
  74 #include <kern/sched_prim.h>
  75 #include <kern/host.h>
  76 #include <kern/xpr.h>
  77 #include <vm/vm_map.h>
  78 #include <vm/vm_object.h>
  79 #include <vm/vm_page.h>
  80 #include <vm/pmap.h>
  81 #include <vm/vm_pageout.h>
  82 #include <mach/vm_param.h>
  83 #include <mach/vm_behavior.h>
  84 #include <mach/memory_object.h>
  85                                 /* For memory_object_data_{request,unlock} */
  86 #include <kern/mach_param.h>
  87 #include <kern/macro_help.h>
  88 #include <kern/zalloc.h>
  89 #include <kern/misc_protos.h>
  90
  91 #include <sys/kdebug.h>
  92
  93 #define VM_FAULT_CLASSIFY       0
  94 #define VM_FAULT_STATIC_CONFIG  1
  95
  96 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
  97
  98 int             vm_object_absent_max = 50;
  99
 100 int             vm_fault_debug = 0;
 101 boolean_t       vm_page_deactivate_behind = TRUE;
 102
 103 vm_machine_attribute_val_t mv_cache_sync = MATTR_VAL_CACHE_SYNC;
 104
 105 #if     !VM_FAULT_STATIC_CONFIG
 106 boolean_t       vm_fault_dirty_handling = FALSE;
 107 boolean_t       vm_fault_interruptible = FALSE;
 108 boolean_t       software_reference_bits = TRUE;
 109 #endif
 110
 111 #if     MACH_KDB
 112 extern struct db_watchpoint *db_watchpoint_list;
 113 #endif  /* MACH_KDB */
 114
 115 /* Forward declarations of internal routines. */
 116 extern kern_return_t vm_fault_wire_fast(
 117                                 vm_map_t        map,
 118                                 vm_offset_t     va,
 119                                 vm_map_entry_t  entry,
 120                                 pmap_t          pmap);
 121
 122 extern void vm_fault_continue(void);
 123
 124 extern void vm_fault_copy_cleanup(
 125                                 vm_page_t       page,
 126                                 vm_page_t       top_page);
 127
 128 extern void vm_fault_copy_dst_cleanup(
 129                                 vm_page_t       page);
 130
 131 #if     VM_FAULT_CLASSIFY
 132 extern void vm_fault_classify(vm_object_t       object,
 133                           vm_object_offset_t    offset,
 134                           vm_prot_t             fault_type);
 135
 136 extern void vm_fault_classify_init(void);
 137 #endif
 138
 139 /*
 140  *      Routine:        vm_fault_init
 141  *      Purpose:
 142  *              Initialize our private data structures.
 143  */
 144 void
 145 vm_fault_init(void)
 146 {
 147 }
 148
 149 /*
 150  *      Routine:        vm_fault_cleanup
 151  *      Purpose:
 152  *              Clean up the result of vm_fault_page.
 153  *      Results:
 154  *              The paging reference for "object" is released.
 155  *              "object" is unlocked.
 156  *              If "top_page" is not null,  "top_page" is
 157  *              freed and the paging reference for the object
 158  *              containing it is released.
 159  *
 160  *      In/out conditions:
 161  *              "object" must be locked.
 162  */
 163 void
 164 vm_fault_cleanup(
 165         register vm_object_t    object,
 166         register vm_page_t      top_page)
 167 {
 168         vm_object_paging_end(object);
 169         vm_object_unlock(object);
 170
 171         if (top_page != VM_PAGE_NULL) {
 172             object = top_page->object;
 173             vm_object_lock(object);
 174             VM_PAGE_FREE(top_page);
 175             vm_object_paging_end(object);
 176             vm_object_unlock(object);
 177         }
 178 }
 179
 180 #if     MACH_CLUSTER_STATS
 181 #define MAXCLUSTERPAGES 16
 182 struct {
 183         unsigned long pages_in_cluster;
 184         unsigned long pages_at_higher_offsets;
 185         unsigned long pages_at_lower_offsets;
 186 } cluster_stats_in[MAXCLUSTERPAGES];
 187 #define CLUSTER_STAT(clause)    clause
 188 #define CLUSTER_STAT_HIGHER(x)  \
 189         ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
 190 #define CLUSTER_STAT_LOWER(x)   \
 191          ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
 192 #define CLUSTER_STAT_CLUSTER(x) \
 193         ((cluster_stats_in[(x)].pages_in_cluster)++)
 194 #else   /* MACH_CLUSTER_STATS */
 195 #define CLUSTER_STAT(clause)
 196 #endif  /* MACH_CLUSTER_STATS */
 197
 198 /* XXX - temporary */
 199 boolean_t vm_allow_clustered_pagein = FALSE;
 200 int vm_pagein_cluster_used = 0;
 201
 202 /*
 203  * Prepage default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 204  */
 205 int vm_default_ahead = 1;       /* Number of pages to prepage ahead */
 206 int vm_default_behind = 0;      /* Number of pages to prepage behind */
 207
 208 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
 209
 210 /*
 211  *      Routine:        vm_fault_page
 212  *      Purpose:
 213  *              Find the resident page for the virtual memory
 214  *              specified by the given virtual memory object
 215  *              and offset.
 216  *      Additional arguments:
 217  *              The required permissions for the page is given
 218  *              in "fault_type".  Desired permissions are included
 219  *              in "protection".  The minimum and maximum valid offsets
 220  *              within the object for the relevant map entry are
 221  *              passed in "lo_offset" and "hi_offset" respectively and
 222  *              the expected page reference pattern is passed in "behavior".
 223  *              These three parameters are used to determine pagein cluster
 224  *              limits.
 225  *
 226  *              If the desired page is known to be resident (for
 227  *              example, because it was previously wired down), asserting
 228  *              the "unwiring" parameter will speed the search.
 229  *
 230  *              If the operation can be interrupted (by thread_abort
 231  *              or thread_terminate), then the "interruptible"
 232  *              parameter should be asserted.
 233  *
 234  *      Results:
 235  *              The page containing the proper data is returned
 236  *              in "result_page".
 237  *
 238  *      In/out conditions:
 239  *              The source object must be locked and referenced,
 240  *              and must donate one paging reference.  The reference
 241  *              is not affected.  The paging reference and lock are
 242  *              consumed.
 243  *
 244  *              If the call succeeds, the object in which "result_page"
 245  *              resides is left locked and holding a paging reference.
 246  *              If this is not the original object, a busy page in the
 247  *              original object is returned in "top_page", to prevent other
 248  *              callers from pursuing this same data, along with a paging
 249  *              reference for the original object.  The "top_page" should
 250  *              be destroyed when this guarantee is no longer required.
 251  *              The "result_page" is also left busy.  It is not removed
 252  *              from the pageout queues.
 253  */
 254
 255 vm_fault_return_t
 256 vm_fault_page(
 257         /* Arguments: */
 258         vm_object_t     first_object,   /* Object to begin search */
 259         vm_object_offset_t first_offset,        /* Offset into object */
 260         vm_prot_t       fault_type,     /* What access is requested */
 261         boolean_t       must_be_resident,/* Must page be resident? */
 262         int             interruptible,  /* how may fault be interrupted? */
 263         vm_object_offset_t lo_offset,   /* Map entry start */
 264         vm_object_offset_t hi_offset,   /* Map entry end */
 265         vm_behavior_t   behavior,       /* Page reference behavior */
 266         /* Modifies in place: */
 267         vm_prot_t       *protection,    /* Protection for mapping */
 268         /* Returns: */
 269         vm_page_t       *result_page,   /* Page found, if successful */
 270         vm_page_t       *top_page,      /* Page in top object, if
 271                                          * not result_page.  */
 272         int             *type_of_fault, /* if non-null, fill in with type of fault
 273                                          * COW, zero-fill, etc... returned in trace point */
 274         /* More arguments: */
 275         kern_return_t   *error_code,    /* code if page is in error */
 276         boolean_t       no_zero_fill,   /* don't zero fill absent pages */
 277         boolean_t       data_supply)    /* treat as data_supply if
 278                                          * it is a write fault and a full
 279                                          * page is provided */
 280 {
 281         register
 282         vm_page_t               m;
 283         register
 284         vm_object_t             object;
 285         register
 286         vm_object_offset_t      offset;
 287         vm_page_t               first_m;
 288         vm_object_t             next_object;
 289         vm_object_t             copy_object;
 290         boolean_t               look_for_page;
 291         vm_prot_t               access_required = fault_type;
 292         vm_prot_t               wants_copy_flag;
 293         vm_size_t               cluster_size, length;
 294         vm_object_offset_t      cluster_offset;
 295         vm_object_offset_t      cluster_start, cluster_end, paging_offset;
 296         vm_object_offset_t      align_offset;
 297         CLUSTER_STAT(int pages_at_higher_offsets;)
 298         CLUSTER_STAT(int pages_at_lower_offsets;)
 299         kern_return_t   wait_result;
 300         thread_t                cur_thread;
 301         boolean_t               interruptible_state;
 302
 303 #ifdef  MACH_BSD
 304         kern_return_t   vnode_pager_data_request(ipc_port_t,
 305                         ipc_port_t, vm_object_offset_t, vm_size_t, vm_prot_t);
 306 #endif
 307
 308 #if     MACH_PAGEMAP
 309 /*
 310  * MACH page map - an optional optimization where a bit map is maintained
 311  * by the VM subsystem for internal objects to indicate which pages of
 312  * the object currently reside on backing store.  This existence map
 313  * duplicates information maintained by the vnode pager.  It is
 314  * created at the time of the first pageout against the object, i.e.
 315  * at the same time pager for the object is created.  The optimization
 316  * is designed to eliminate pager interaction overhead, if it is
 317  * 'known' that the page does not exist on backing store.
 318  *
 319  * LOOK_FOR() evaluates to TRUE if the page specified by object/offset is
 320  * either marked as paged out in the existence map for the object or no
 321  * existence map exists for the object.  LOOK_FOR() is one of the
 322  * criteria in the decision to invoke the pager.   It is also used as one
 323  * of the criteria to terminate the scan for adjacent pages in a clustered
 324  * pagein operation.  Note that LOOK_FOR() always evaluates to TRUE for
 325  * permanent objects.  Note also that if the pager for an internal object
 326  * has not been created, the pager is not invoked regardless of the value
 327  * of LOOK_FOR() and that clustered pagein scans are only done on an object
 328  * for which a pager has been created.
 329  *
 330  * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 331  * is marked as paged out in the existence map for the object.  PAGED_OUT()
 332  * PAGED_OUT() is used to determine if a page has already been pushed
 333  * into a copy object in order to avoid a redundant page out operation.
 334  */
 335 #define LOOK_FOR(o, f) (vm_external_state_get((o)->existence_map, (f)) \
 336                         != VM_EXTERNAL_STATE_ABSENT)
 337 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
 338                         == VM_EXTERNAL_STATE_EXISTS)
 339 #else /* MACH_PAGEMAP */
 340 /*
 341  * If the MACH page map optimization is not enabled,
 342  * LOOK_FOR() always evaluates to TRUE.  The pager will always be
 343  * invoked to resolve missing pages in an object, assuming the pager
 344  * has been created for the object.  In a clustered page operation, the
 345  * absence of a page on backing backing store cannot be used to terminate
 346  * a scan for adjacent pages since that information is available only in
 347  * the pager.  Hence pages that may not be paged out are potentially
 348  * included in a clustered request.  The vnode pager is coded to deal
 349  * with any combination of absent/present pages in a clustered
 350  * pagein request.  PAGED_OUT() always evaluates to FALSE, i.e. the pager
 351  * will always be invoked to push a dirty page into a copy object assuming
 352  * a pager has been created.  If the page has already been pushed, the
 353  * pager will ingore the new request.
 354  */
 355 #define LOOK_FOR(o, f) TRUE
 356 #define PAGED_OUT(o, f) FALSE
 357 #endif /* MACH_PAGEMAP */
 358
 359 /*
 360  *      Recovery actions
 361  */
 362 #define PREPARE_RELEASE_PAGE(m)                         \
 363         MACRO_BEGIN                                     \
 364         vm_page_lock_queues();                          \
 365         MACRO_END
 366
 367 #define DO_RELEASE_PAGE(m)                              \
 368         MACRO_BEGIN                                     \
 369         PAGE_WAKEUP_DONE(m);                            \
 370         if (!m->active && !m->inactive)                 \
 371                 vm_page_activate(m);                    \
 372         vm_page_unlock_queues();                        \
 373         MACRO_END
 374
 375 #define RELEASE_PAGE(m)                                 \
 376         MACRO_BEGIN                                     \
 377         PREPARE_RELEASE_PAGE(m);                        \
 378         DO_RELEASE_PAGE(m);                             \
 379         MACRO_END
 380
 381 #if TRACEFAULTPAGE
 382         dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
 383 #endif
 384
 385
 386
 387 #if     !VM_FAULT_STATIC_CONFIG
 388         if (vm_fault_dirty_handling
 389 #if     MACH_KDB
 390                 /*
 391                  *      If there are watchpoints set, then
 392                  *      we don't want to give away write permission
 393                  *      on a read fault.  Make the task write fault,
 394                  *      so that the watchpoint code notices the access.
 395                  */
 396             || db_watchpoint_list
 397 #endif  /* MACH_KDB */
 398             ) {
 399                 /*
 400                  *      If we aren't asking for write permission,
 401                  *      then don't give it away.  We're using write
 402                  *      faults to set the dirty bit.
 403                  */
 404                 if (!(fault_type & VM_PROT_WRITE))
 405                         *protection &= ~VM_PROT_WRITE;
 406         }
 407
 408         if (!vm_fault_interruptible)
 409                 interruptible = THREAD_UNINT;
 410 #else   /* STATIC_CONFIG */
 411 #if     MACH_KDB
 412                 /*
 413                  *      If there are watchpoints set, then
 414                  *      we don't want to give away write permission
 415                  *      on a read fault.  Make the task write fault,
 416                  *      so that the watchpoint code notices the access.
 417                  */
 418             if (db_watchpoint_list) {
 419                 /*
 420                  *      If we aren't asking for write permission,
 421                  *      then don't give it away.  We're using write
 422                  *      faults to set the dirty bit.
 423                  */
 424                 if (!(fault_type & VM_PROT_WRITE))
 425                         *protection &= ~VM_PROT_WRITE;
 426         }
 427
 428 #endif  /* MACH_KDB */
 429 #endif  /* STATIC_CONFIG */
 430
 431         cur_thread = current_thread();
 432
 433         interruptible_state = cur_thread->interruptible;
 434         if (interruptible == THREAD_UNINT)
 435                 cur_thread->interruptible = FALSE;
 436
 437         /*
 438          *      INVARIANTS (through entire routine):
 439          *
 440          *      1)      At all times, we must either have the object
 441          *              lock or a busy page in some object to prevent
 442          *              some other thread from trying to bring in
 443          *              the same page.
 444          *
 445          *              Note that we cannot hold any locks during the
 446          *              pager access or when waiting for memory, so
 447          *              we use a busy page then.
 448          *
 449          *              Note also that we aren't as concerned about more than
 450          *              one thread attempting to memory_object_data_unlock
 451          *              the same page at once, so we don't hold the page
 452          *              as busy then, but do record the highest unlock
 453          *              value so far.  [Unlock requests may also be delivered
 454          *              out of order.]
 455          *
 456          *      2)      To prevent another thread from racing us down the
 457          *              shadow chain and entering a new page in the top
 458          *              object before we do, we must keep a busy page in
 459          *              the top object while following the shadow chain.
 460          *
 461          *      3)      We must increment paging_in_progress on any object
 462          *              for which we have a busy page
 463          *
 464          *      4)      We leave busy pages on the pageout queues.
 465          *              If the pageout daemon comes across a busy page,
 466          *              it will remove the page from the pageout queues.
 467          */
 468
 469         /*
 470          *      Search for the page at object/offset.
 471          */
 472
 473         object = first_object;
 474         offset = first_offset;
 475         first_m = VM_PAGE_NULL;
 476         access_required = fault_type;
 477
 478         XPR(XPR_VM_FAULT,
 479                 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
 480                 (integer_t)object, offset, fault_type, *protection, 0);
 481
 482         /*
 483          *      See whether this page is resident
 484          */
 485
 486         while (TRUE) {
 487 #if TRACEFAULTPAGE
 488                 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
 489 #endif
 490                 if (!object->alive) {
 491                         vm_fault_cleanup(object, first_m);
 492                         cur_thread->interruptible = interruptible_state;
 493                         return(VM_FAULT_MEMORY_ERROR);
 494                 }
 495                 m = vm_page_lookup(object, offset);
 496 #if TRACEFAULTPAGE
 497                 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
 498 #endif
 499                 if (m != VM_PAGE_NULL) {
 500                         /*
 501                          *      If the page was pre-paged as part of a
 502                          *      cluster, record the fact.
 503                          */
 504                         if (m->clustered) {
 505                                 vm_pagein_cluster_used++;
 506                                 m->clustered = FALSE;
 507                         }
 508
 509                         /*
 510                          *      If the page is being brought in,
 511                          *      wait for it and then retry.
 512                          *
 513                          *      A possible optimization: if the page
 514                          *      is known to be resident, we can ignore
 515                          *      pages that are absent (regardless of
 516                          *      whether they're busy).
 517                          */
 518
 519                         if (m->busy) {
 520 #if TRACEFAULTPAGE
 521                                 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
 522 #endif
 523                                 PAGE_ASSERT_WAIT(m, interruptible);
 524                                 vm_object_unlock(object);
 525                                 XPR(XPR_VM_FAULT,
 526                                     "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
 527                                         (integer_t)object, offset,
 528                                         (integer_t)m, 0, 0);
 529                                 counter(c_vm_fault_page_block_busy_kernel++);
 530                                 wait_result = thread_block((void (*)(void))0);
 531
 532                                 vm_object_lock(object);
 533                                 if (wait_result != THREAD_AWAKENED) {
 534                                         vm_fault_cleanup(object, first_m);
 535                                         cur_thread->interruptible = interruptible_state;
 536                                         if (wait_result == THREAD_RESTART)
 537                                           {
 538                                                 return(VM_FAULT_RETRY);
 539                                           }
 540                                         else
 541                                           {
 542                                                 return(VM_FAULT_INTERRUPTED);
 543                                           }
 544                                 }
 545                                 continue;
 546                         }
 547
 548                         /*
 549                          *      If the page is in error, give up now.
 550                          */
 551
 552                         if (m->error) {
 553 #if TRACEFAULTPAGE
 554                                 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);      /* (TEST/DEBUG) */
 555 #endif
 556                                 if (error_code)
 557                                         *error_code = m->page_error;
 558                                 VM_PAGE_FREE(m);
 559                                 vm_fault_cleanup(object, first_m);
 560                                 cur_thread->interruptible = interruptible_state;
 561                                 return(VM_FAULT_MEMORY_ERROR);
 562                         }
 563
 564                         /*
 565                          *      If the pager wants us to restart
 566                          *      at the top of the chain,
 567                          *      typically because it has moved the
 568                          *      page to another pager, then do so.
 569                          */
 570
 571                         if (m->restart) {
 572 #if TRACEFAULTPAGE
 573                                 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
 574 #endif
 575                                 VM_PAGE_FREE(m);
 576                                 vm_fault_cleanup(object, first_m);
 577                                 cur_thread->interruptible = interruptible_state;
 578                                 return(VM_FAULT_RETRY);
 579                         }
 580
 581                         /*
 582                          *      If the page isn't busy, but is absent,
 583                          *      then it was deemed "unavailable".
 584                          */
 585
 586                         if (m->absent) {
 587                                 /*
 588                                  * Remove the non-existent page (unless it's
 589                                  * in the top object) and move on down to the
 590                                  * next object (if there is one).
 591                                  */
 592 #if TRACEFAULTPAGE
 593                                 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);  /* (TEST/DEBUG) */
 594 #endif
 595
 596                                 next_object = object->shadow;
 597                                 if (next_object == VM_OBJECT_NULL) {
 598                                         vm_page_t real_m;
 599
 600                                         assert(!must_be_resident);
 601
 602                                         if (object->shadow_severed) {
 603                                                 vm_fault_cleanup(
 604                                                         object, first_m);
 605                                                 cur_thread->interruptible = interruptible_state;
 606                                                 return VM_FAULT_MEMORY_ERROR;
 607                                         }
 608
 609                                         /*
 610                                          * Absent page at bottom of shadow
 611                                          * chain; zero fill the page we left
 612                                          * busy in the first object, and flush
 613                                          * the absent page.  But first we
 614                                          * need to allocate a real page.
 615                                          */
 616                                         if (VM_PAGE_THROTTLED() ||
 617                                             (real_m = vm_page_grab()) == VM_PAGE_NULL) {
 618                                                 vm_fault_cleanup(object, first_m);
 619                                                 cur_thread->interruptible = interruptible_state;
 620                                                 return(VM_FAULT_MEMORY_SHORTAGE);
 621                                         }
 622
 623                                         XPR(XPR_VM_FAULT,
 624               "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
 625                                                 (integer_t)object, offset,
 626                                                 (integer_t)m,
 627                                                 (integer_t)first_object, 0);
 628                                         if (object != first_object) {
 629                                                 VM_PAGE_FREE(m);
 630                                                 vm_object_paging_end(object);
 631                                                 vm_object_unlock(object);
 632                                                 object = first_object;
 633                                                 offset = first_offset;
 634                                                 m = first_m;
 635                                                 first_m = VM_PAGE_NULL;
 636                                                 vm_object_lock(object);
 637                                         }
 638
 639                                         VM_PAGE_FREE(m);
 640                                         assert(real_m->busy);
 641                                         vm_page_insert(real_m, object, offset);
 642                                         m = real_m;
 643
 644                                         /*
 645                                          *  Drop the lock while zero filling
 646                                          *  page.  Then break because this
 647                                          *  is the page we wanted.  Checking
 648                                          *  the page lock is a waste of time;
 649                                          *  this page was either absent or
 650                                          *  newly allocated -- in both cases
 651                                          *  it can't be page locked by a pager.
 652                                          */
 653                                         if (!no_zero_fill) {
 654                                                 vm_object_unlock(object);
 655                                                 vm_page_zero_fill(m);
 656                                                 if (type_of_fault)
 657                                                         *type_of_fault = DBG_ZERO_FILL_FAULT;
 658                                                 VM_STAT(zero_fill_count++);
 659                                                 vm_object_lock(object);
 660                                         }
 661                                         pmap_clear_modify(m->phys_addr);
 662                                         vm_page_lock_queues();
 663                                         VM_PAGE_QUEUES_REMOVE(m);
 664                                         queue_enter(&vm_page_queue_inactive,
 665                                                         m, vm_page_t, pageq);
 666                                         m->inactive = TRUE;
 667                                         vm_page_inactive_count++;
 668                                         vm_page_unlock_queues();
 669                                         break;
 670                                 } else {
 671                                         if (must_be_resident) {
 672                                                 vm_object_paging_end(object);
 673                                         } else if (object != first_object) {
 674                                                 vm_object_paging_end(object);
 675                                                 VM_PAGE_FREE(m);
 676                                         } else {
 677                                                 first_m = m;
 678                                                 m->absent = FALSE;
 679                                                 m->unusual = FALSE;
 680                                                 vm_object_absent_release(object);
 681                                                 m->busy = TRUE;
 682
 683                                                 vm_page_lock_queues();
 684                                                 VM_PAGE_QUEUES_REMOVE(m);
 685                                                 vm_page_unlock_queues();
 686                                         }
 687                                         XPR(XPR_VM_FAULT,
 688                                             "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
 689                                                 (integer_t)object, offset,
 690                                                 (integer_t)next_object,
 691                                                 offset+object->shadow_offset,0);
 692                                         offset += object->shadow_offset;
 693                                         hi_offset += object->shadow_offset;
 694                                         lo_offset += object->shadow_offset;
 695                                         access_required = VM_PROT_READ;
 696                                         vm_object_lock(next_object);
 697                                         vm_object_unlock(object);
 698                                         object = next_object;
 699                                         vm_object_paging_begin(object);
 700                                         continue;
 701                                 }
 702                         }
 703
 704                         if ((m->cleaning)
 705                                 && ((object != first_object) ||
 706                                     (object->copy != VM_OBJECT_NULL))
 707                                 && (fault_type & VM_PROT_WRITE)) {
 708                                 /*
 709                                  * This is a copy-on-write fault that will
 710                                  * cause us to revoke access to this page, but
 711                                  * this page is in the process of being cleaned
 712                                  * in a clustered pageout. We must wait until
 713                                  * the cleaning operation completes before
 714                                  * revoking access to the original page,
 715                                  * otherwise we might attempt to remove a
 716                                  * wired mapping.
 717                                  */
 718 #if TRACEFAULTPAGE
 719                                 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
 720 #endif
 721                                 XPR(XPR_VM_FAULT,
 722                                     "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
 723                                         (integer_t)object, offset,
 724                                         (integer_t)m, 0, 0);
 725                                 /* take an extra ref so that object won't die */
 726                                 assert(object->ref_count > 0);
 727                                 object->ref_count++;
 728                                 vm_object_res_reference(object);
 729                                 vm_fault_cleanup(object, first_m);
 730                                 counter(c_vm_fault_page_block_backoff_kernel++);
 731                                 vm_object_lock(object);
 732                                 assert(object->ref_count > 0);
 733                                 m = vm_page_lookup(object, offset);
 734                                 if (m != VM_PAGE_NULL && m->cleaning) {
 735                                         PAGE_ASSERT_WAIT(m, interruptible);
 736                                         vm_object_unlock(object);
 737                                         wait_result = thread_block((void (*)(void)) 0);
 738                                         vm_object_deallocate(object);
 739                                         goto backoff;
 740                                 } else {
 741                                         vm_object_unlock(object);
 742                                         vm_object_deallocate(object);
 743                                         cur_thread->interruptible = interruptible_state;
 744                                         return VM_FAULT_RETRY;
 745                                 }
 746                         }
 747
 748                         /*
 749                          *      If the desired access to this page has
 750                          *      been locked out, request that it be unlocked.
 751                          */
 752
 753                         if (access_required & m->page_lock) {
 754                                 if ((access_required & m->unlock_request) != access_required) {
 755                                         vm_prot_t       new_unlock_request;
 756                                         kern_return_t   rc;
 757
 758 #if TRACEFAULTPAGE
 759                                         dbgTrace(0xBEEF000A, (unsigned int) m, (unsigned int) object->pager_ready);     /* (TEST/DEBUG) */
 760 #endif
 761                                         if (!object->pager_ready) {
 762                                         XPR(XPR_VM_FAULT,
 763                                             "vm_f_page: ready wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
 764                                                 access_required,
 765                                                 (integer_t)object, offset,
 766                                                 (integer_t)m, 0);
 767                                                 /* take an extra ref */
 768                                                 assert(object->ref_count > 0);
 769                                                 object->ref_count++;
 770                                                 vm_object_res_reference(object);
 771                                                 vm_fault_cleanup(object,
 772                                                                  first_m);
 773                                                 counter(c_vm_fault_page_block_backoff_kernel++);
 774                                                 vm_object_lock(object);
 775                                                 assert(object->ref_count > 0);
 776                                                 if (!object->pager_ready) {
 777                                                         vm_object_assert_wait(
 778                                                                 object,
 779                                                                 VM_OBJECT_EVENT_PAGER_READY,
 780                                                                 interruptible);
 781                                                         vm_object_unlock(object);
 782                                                         wait_result = thread_block((void (*)(void))0);
 783                                                         vm_object_deallocate(object);
 784                                                         goto backoff;
 785                                                 } else {
 786                                                         vm_object_unlock(object);
 787                                                         vm_object_deallocate(object);
 788                                                         cur_thread->interruptible = interruptible_state;
 789                                                         return VM_FAULT_RETRY;
 790                                                 }
 791                                         }
 792
 793                                         new_unlock_request = m->unlock_request =
 794                                                 (access_required | m->unlock_request);
 795                                         vm_object_unlock(object);
 796                                         XPR(XPR_VM_FAULT,
 797                                             "vm_f_page: unlock obj 0x%X, offset 0x%X, page 0x%X, unl_req %d\n",
 798                                         (integer_t)object, offset,
 799                                         (integer_t)m, new_unlock_request, 0);
 800                                         if ((rc = memory_object_data_unlock(
 801                                                 object->pager,
 802                                                 object->pager_request,
 803                                                 offset + object->paging_offset,
 804                                                 PAGE_SIZE,
 805                                                 new_unlock_request))
 806                                              != KERN_SUCCESS) {
 807                                                 if (vm_fault_debug)
 808                                                     printf("vm_fault: memory_object_data_unlock failed\n");
 809                                                 vm_object_lock(object);
 810                                                 vm_fault_cleanup(object, first_m);
 811                                                 cur_thread->interruptible = interruptible_state;
 812                                                 return((rc == MACH_SEND_INTERRUPTED) ?
 813                                                         VM_FAULT_INTERRUPTED :
 814                                                         VM_FAULT_MEMORY_ERROR);
 815                                         }
 816                                         vm_object_lock(object);
 817                                         continue;
 818                                 }
 819
 820                                 XPR(XPR_VM_FAULT,
 821         "vm_f_page: access wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
 822                                         access_required, (integer_t)object,
 823                                         offset, (integer_t)m, 0);
 824                                 /* take an extra ref so object won't die */
 825                                 assert(object->ref_count > 0);
 826                                 object->ref_count++;
 827                                 vm_object_res_reference(object);
 828                                 vm_fault_cleanup(object, first_m);
 829                                 counter(c_vm_fault_page_block_backoff_kernel++);
 830                                 vm_object_lock(object);
 831                                 assert(object->ref_count > 0);
 832                                 m = vm_page_lookup(object, offset);
 833                                 if (m != VM_PAGE_NULL &&
 834                                     (access_required & m->page_lock) &&
 835                                     !((access_required & m->unlock_request) != access_required)) {
 836                                         PAGE_ASSERT_WAIT(m, interruptible);
 837                                         vm_object_unlock(object);
 838                                         wait_result = thread_block((void (*)(void)) 0);
 839                                         vm_object_deallocate(object);
 840                                         goto backoff;
 841                                 } else {
 842                                         vm_object_unlock(object);
 843                                         vm_object_deallocate(object);
 844                                         cur_thread->interruptible = interruptible_state;
 845                                         return VM_FAULT_RETRY;
 846                                 }
 847                         }
 848                         /*
 849                          *      We mark the page busy and leave it on
 850                          *      the pageout queues.  If the pageout
 851                          *      deamon comes across it, then it will
 852                          *      remove the page.
 853                          */
 854
 855 #if TRACEFAULTPAGE
 856                         dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
 857 #endif
 858
 859 #if     !VM_FAULT_STATIC_CONFIG
 860                         if (!software_reference_bits) {
 861                                 vm_page_lock_queues();
 862                                 if (m->inactive)
 863                                         vm_stat.reactivations++;
 864
 865                                 VM_PAGE_QUEUES_REMOVE(m);
 866                                 vm_page_unlock_queues();
 867                         }
 868 #endif
 869                         XPR(XPR_VM_FAULT,
 870                             "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
 871                                 (integer_t)object, offset, (integer_t)m, 0, 0);
 872                         assert(!m->busy);
 873                         m->busy = TRUE;
 874                         assert(!m->absent);
 875                         break;
 876                 }
 877
 878                 look_for_page =
 879                         (object->pager_created) &&
 880                           LOOK_FOR(object, offset) &&
 881                             (!data_supply);
 882
 883 #if TRACEFAULTPAGE
 884                 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);      /* (TEST/DEBUG) */
 885 #endif
 886                 if ((look_for_page || (object == first_object))
 887                                  && !must_be_resident) {
 888                         /*
 889                          *      Allocate a new page for this object/offset
 890                          *      pair.
 891                          */
 892
 893                         m = vm_page_grab_fictitious();
 894 #if TRACEFAULTPAGE
 895                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
 896 #endif
 897                         if (m == VM_PAGE_NULL) {
 898                                 vm_fault_cleanup(object, first_m);
 899                                 cur_thread->interruptible = interruptible_state;
 900                                 return(VM_FAULT_FICTITIOUS_SHORTAGE);
 901                         }
 902                         vm_page_insert(m, object, offset);
 903                 }
 904
 905                 if (look_for_page && !must_be_resident) {
 906                         kern_return_t   rc;
 907
 908                         /*
 909                          *      If the memory manager is not ready, we
 910                          *      cannot make requests.
 911                          */
 912                         if (!object->pager_ready) {
 913 #if TRACEFAULTPAGE
 914                                 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
 915 #endif
 916                                 VM_PAGE_FREE(m);
 917                                 XPR(XPR_VM_FAULT,
 918                                 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
 919                                         (integer_t)object, offset, 0, 0, 0);
 920                                 /* take an extra ref so object won't die */
 921                                 assert(object->ref_count > 0);
 922                                 object->ref_count++;
 923                                 vm_object_res_reference(object);
 924                                 vm_fault_cleanup(object, first_m);
 925                                 counter(c_vm_fault_page_block_backoff_kernel++);
 926                                 vm_object_lock(object);
 927                                 assert(object->ref_count > 0);
 928                                 if (!object->pager_ready) {
 929                                         vm_object_assert_wait(object,
 930                                                               VM_OBJECT_EVENT_PAGER_READY,
 931                                                               interruptible);
 932                                         vm_object_unlock(object);
 933                                         wait_result = thread_block((void (*)(void))0);
 934                                         vm_object_deallocate(object);
 935                                         goto backoff;
 936                                 } else {
 937                                         vm_object_unlock(object);
 938                                         vm_object_deallocate(object);
 939                                         cur_thread->interruptible = interruptible_state;
 940                                         return VM_FAULT_RETRY;
 941                                 }
 942                         }
 943
 944                         if (object->internal) {
 945                                 /*
 946                                  *      Requests to the default pager
 947                                  *      must reserve a real page in advance,
 948                                  *      because the pager's data-provided
 949                                  *      won't block for pages.  IMPORTANT:
 950                                  *      this acts as a throttling mechanism
 951                                  *      for data_requests to the default
 952                                  *      pager.
 953                                  */
 954
 955 #if TRACEFAULTPAGE
 956                                 dbgTrace(0xBEEF000F, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
 957 #endif
 958                                 if (m->fictitious && !vm_page_convert(m)) {
 959                                         VM_PAGE_FREE(m);
 960                                         vm_fault_cleanup(object, first_m);
 961                                         cur_thread->interruptible = interruptible_state;
 962                                         return(VM_FAULT_MEMORY_SHORTAGE);
 963                                 }
 964                         } else if (object->absent_count >
 965                                                 vm_object_absent_max) {
 966                                 /*
 967                                  *      If there are too many outstanding page
 968                                  *      requests pending on this object, we
 969                                  *      wait for them to be resolved now.
 970                                  */
 971
 972 #if TRACEFAULTPAGE
 973                                 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
 974 #endif
 975                                 VM_PAGE_FREE(m);
 976                                 /* take an extra ref so object won't die */
 977                                 assert(object->ref_count > 0);
 978                                 object->ref_count++;
 979                                 vm_object_res_reference(object);
 980                                 vm_fault_cleanup(object, first_m);
 981                                 counter(c_vm_fault_page_block_backoff_kernel++);
 982                                 vm_object_lock(object);
 983                                 assert(object->ref_count > 0);
 984                                 if (object->absent_count > vm_object_absent_max) {
 985                                         vm_object_absent_assert_wait(object,
 986                                                                      interruptible);
 987                                         vm_object_unlock(object);
 988                                         wait_result = thread_block((void (*)(void))0);
 989                                         vm_object_deallocate(object);
 990                                         goto backoff;
 991                                 } else {
 992                                         vm_object_unlock(object);
 993                                         vm_object_deallocate(object);
 994                                         cur_thread->interruptible = interruptible_state;
 995                                         return VM_FAULT_RETRY;
 996                                 }
 997                         }
 998
 999                         /*
1000                          *      Indicate that the page is waiting for data
1001                          *      from the memory manager.
1002                          */
1003
1004                         m->list_req_pending = TRUE;
1005                         m->absent = TRUE;
1006                         m->unusual = TRUE;
1007                         object->absent_count++;
1008
1009                         cluster_start = offset;
1010                         length = PAGE_SIZE;
1011                         cluster_size = object->cluster_size;
1012
1013                         /*
1014                          * Skip clustered pagein if it is globally disabled
1015                          * or random page reference behavior is expected
1016                          * for the address range containing the faulting
1017                          * address or the object paging block size is
1018                          * equal to the page size.
1019                          */
1020                         if (!vm_allow_clustered_pagein ||
1021                              behavior == VM_BEHAVIOR_RANDOM ||
1022                              cluster_size == PAGE_SIZE) {
1023                                 cluster_start = trunc_page_64(cluster_start);
1024                                 goto no_clustering;
1025                         }
1026
1027                         assert(offset >= lo_offset);
1028                         assert(offset < hi_offset);
1029                         assert(ALIGNED(object->paging_offset));
1030                         assert(cluster_size >= PAGE_SIZE);
1031
1032 #if TRACEFAULTPAGE
1033                         dbgTrace(0xBEEF0011, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1034 #endif
1035                         /*
1036                          * Decide whether to scan ahead or behind for
1037                          * additional pages contiguous to the faulted
1038                          * page in the same paging block.  The decision
1039                          * is based on system wide globals and the
1040                          * expected page reference behavior of the
1041                          * address range contained the faulting address.
1042                          * First calculate some constants.
1043                          */
1044                         paging_offset = offset + object->paging_offset;
1045                         cluster_offset = paging_offset & (cluster_size - 1);
1046                         align_offset = paging_offset&(PAGE_SIZE_64-1);
1047                         if (align_offset != 0) {
1048                                 cluster_offset = trunc_page_64(cluster_offset);
1049                         }
1050
1051 #define SPANS_CLUSTER(x) ((((x) - align_offset) & (vm_object_offset_t)(cluster_size - 1)) == 0)
1052
1053                         /*
1054                          * Backward scan only if reverse sequential
1055                          * behavior has been specified
1056                          */
1057                         CLUSTER_STAT(pages_at_lower_offsets = 0;)
1058                         if (((vm_default_behind != 0 &&
1059                              behavior == VM_BEHAVIOR_DEFAULT) ||
1060                              behavior == VM_BEHAVIOR_RSEQNTL) && offset) {
1061                             vm_object_offset_t cluster_bot;
1062
1063                             /*
1064                              * Calculate lower search boundary.
1065                              * Exclude pages that span a cluster boundary.
1066                              * Clip to start of map entry.
1067                              * For default page reference behavior, scan
1068                              * default pages behind.
1069                              */
1070                             cluster_bot = (offset > cluster_offset) ?
1071                                             offset - cluster_offset : offset;
1072                             if (align_offset != 0) {
1073                                 if ((cluster_bot < offset) &&
1074                                     SPANS_CLUSTER(cluster_bot)) {
1075                                         cluster_bot += PAGE_SIZE_64;
1076                                 }
1077                             }
1078                             if (behavior == VM_BEHAVIOR_DEFAULT) {
1079                                 vm_object_offset_t
1080                                         bot = (vm_object_offset_t)
1081                                                 (vm_default_behind * PAGE_SIZE);
1082
1083                                 if (cluster_bot < (offset - bot))
1084                                         cluster_bot = offset - bot;
1085                             }
1086                             if (lo_offset > cluster_bot)
1087                                 cluster_bot = lo_offset;
1088
1089                             for ( cluster_start = offset - PAGE_SIZE_64;
1090                                  (cluster_start >= cluster_bot) &&
1091                                  (cluster_start !=
1092                                         (align_offset - PAGE_SIZE_64));
1093                                   cluster_start -= PAGE_SIZE_64) {
1094                                 assert(cluster_size > PAGE_SIZE_64);
1095 retry_cluster_backw:
1096                                 if (!LOOK_FOR(object, cluster_start) ||
1097                                     vm_page_lookup(object, cluster_start)
1098                                                 != VM_PAGE_NULL) {
1099                                         break;
1100                                 }
1101                                 if (object->internal) {
1102                                         /*
1103                                          * need to acquire a real page in
1104                                          * advance because this acts as
1105                                          * a throttling mechanism for
1106                                          * data_requests to the default
1107                                          * pager.  If this fails, give up
1108                                          * trying to find any more pages
1109                                          * in the cluster and send off the
1110                                          * request for what we already have.
1111                                          */
1112                                         if ((m = vm_page_grab())
1113                                                         == VM_PAGE_NULL) {
1114                                             cluster_start += PAGE_SIZE_64;
1115                                             cluster_end = offset + PAGE_SIZE_64;
1116                                             goto give_up;
1117                                         }
1118                                 } else if ((m = vm_page_grab_fictitious())
1119                                                 == VM_PAGE_NULL) {
1120                                         vm_object_unlock(object);
1121                                         vm_page_more_fictitious();
1122                                         vm_object_lock(object);
1123                                         goto retry_cluster_backw;
1124                                 }
1125                                 m->absent = TRUE;
1126                                 m->unusual = TRUE;
1127                                 m->clustered = TRUE;
1128                                 m->list_req_pending = TRUE;
1129
1130                                 vm_page_insert(m, object, cluster_start);
1131                                 CLUSTER_STAT(pages_at_lower_offsets++;)
1132                                 object->absent_count++;
1133                             }
1134                             cluster_start += PAGE_SIZE_64;
1135                             assert(cluster_start >= cluster_bot);
1136                         }
1137                         assert(cluster_start <= offset);
1138
1139                         /*
1140                          * Forward scan if default or sequential behavior
1141                          * specified
1142                          */
1143                         CLUSTER_STAT(pages_at_higher_offsets = 0;)
1144                         if ((behavior == VM_BEHAVIOR_DEFAULT &&
1145                              vm_default_ahead != 0) ||
1146                              behavior == VM_BEHAVIOR_SEQUENTIAL) {
1147                             vm_object_offset_t cluster_top;
1148
1149                             /*
1150                              * Calculate upper search boundary.
1151                              * Exclude pages that span a cluster boundary.
1152                              * Clip to end of map entry.
1153                              * For default page reference behavior, scan
1154                              * default pages ahead.
1155                              */
1156                             cluster_top = (offset + cluster_size) -
1157                                           cluster_offset;
1158                             if (align_offset != 0) {
1159                                 if ((cluster_top > (offset + PAGE_SIZE_64)) &&
1160                                     SPANS_CLUSTER(cluster_top)) {
1161                                         cluster_top -= PAGE_SIZE_64;
1162                                 }
1163                             }
1164                             if (behavior == VM_BEHAVIOR_DEFAULT) {
1165                                 vm_object_offset_t top = (vm_object_offset_t)
1166                                      ((vm_default_ahead*PAGE_SIZE)+PAGE_SIZE);
1167
1168                                 if (cluster_top > (offset + top))
1169                                         cluster_top =  offset + top;
1170                             }
1171                             if (cluster_top > hi_offset)
1172                                         cluster_top = hi_offset;
1173
1174                             for (cluster_end = offset + PAGE_SIZE_64;
1175                                  cluster_end < cluster_top;
1176                                  cluster_end += PAGE_SIZE_64) {
1177                                 assert(cluster_size > PAGE_SIZE);
1178 retry_cluster_forw:
1179                                 if (!LOOK_FOR(object, cluster_end) ||
1180                                     vm_page_lookup(object, cluster_end)
1181                                                 != VM_PAGE_NULL) {
1182                                         break;
1183                                 }
1184                                 if (object->internal) {
1185                                         /*
1186                                          * need to acquire a real page in
1187                                          * advance because this acts as
1188                                          * a throttling mechanism for
1189                                          * data_requests to the default
1190                                          * pager.  If this fails, give up
1191                                          * trying to find any more pages
1192                                          * in the cluster and send off the
1193                                          * request for what we already have.
1194                                          */
1195                                         if ((m = vm_page_grab())
1196                                                         == VM_PAGE_NULL) {
1197                                             break;
1198                                         }
1199                                 } else if ((m = vm_page_grab_fictitious())
1200                                                 == VM_PAGE_NULL) {
1201                                     vm_object_unlock(object);
1202                                     vm_page_more_fictitious();
1203                                     vm_object_lock(object);
1204                                     goto retry_cluster_forw;
1205                                 }
1206                                 m->absent = TRUE;
1207                                 m->unusual = TRUE;
1208                                 m->clustered = TRUE;
1209                                 m->list_req_pending = TRUE;
1210
1211                                 vm_page_insert(m, object, cluster_end);
1212                                 CLUSTER_STAT(pages_at_higher_offsets++;)
1213                                 object->absent_count++;
1214                             }
1215                             assert(cluster_end <= cluster_top);
1216                         }
1217                         else {
1218                                 cluster_end = offset + PAGE_SIZE_64;
1219                         }
1220 give_up:
1221                         assert(cluster_end >= offset + PAGE_SIZE_64);
1222                         length = cluster_end - cluster_start;
1223
1224 #if     MACH_CLUSTER_STATS
1225                         CLUSTER_STAT_HIGHER(pages_at_higher_offsets);
1226                         CLUSTER_STAT_LOWER(pages_at_lower_offsets);
1227                         CLUSTER_STAT_CLUSTER(length/PAGE_SIZE);
1228 #endif  /* MACH_CLUSTER_STATS */
1229
1230 no_clustering:
1231 #if TRACEFAULTPAGE
1232                         dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);  /* (TEST/DEBUG) */
1233 #endif
1234                         /*
1235                          *      We have a busy page, so we can
1236                          *      release the object lock.
1237                          */
1238                         vm_object_unlock(object);
1239
1240                         /*
1241                          *      Call the memory manager to retrieve the data.
1242                          */
1243
1244                         if (type_of_fault)
1245                                 *type_of_fault = DBG_PAGEIN_FAULT;
1246                         VM_STAT(pageins++);
1247                         current_task()->pageins++;
1248
1249                         /*
1250                          *      If this object uses a copy_call strategy,
1251                          *      and we are interested in a copy of this object
1252                          *      (having gotten here only by following a
1253                          *      shadow chain), then tell the memory manager
1254                          *      via a flag added to the desired_access
1255                          *      parameter, so that it can detect a race
1256                          *      between our walking down the shadow chain
1257                          *      and its pushing pages up into a copy of
1258                          *      the object that it manages.
1259                          */
1260
1261                         if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL &&
1262                             object != first_object) {
1263                                 wants_copy_flag = VM_PROT_WANTS_COPY;
1264                         } else {
1265                                 wants_copy_flag = VM_PROT_NONE;
1266                         }
1267
1268                         XPR(XPR_VM_FAULT,
1269                             "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1270                                 (integer_t)object, offset, (integer_t)m,
1271                                 access_required | wants_copy_flag, 0);
1272
1273 #ifdef  MACH_BSD
1274                         if (((rpc_subsystem_t)pager_mux_hash_lookup(object->pager)) ==
1275                             ((rpc_subsystem_t) &vnode_pager_workaround)) {
1276                                 rc = vnode_pager_data_request(object->pager,
1277                                                               object->pager_request,
1278                                                               cluster_start + object->paging_offset,
1279                                                               length,
1280                                                               access_required | wants_copy_flag);
1281                         } else {
1282                                 rc = memory_object_data_request(object->pager,
1283                                                                 object->pager_request,
1284                                                                 cluster_start + object->paging_offset,
1285                                                                 length,
1286                                                                 access_required | wants_copy_flag);
1287                         }
1288 #else
1289                         rc = memory_object_data_request(object->pager,
1290                                         object->pager_request,
1291                                         cluster_start + object->paging_offset,
1292                                         length,
1293                                         access_required | wants_copy_flag);
1294
1295 #endif
1296
1297 #if TRACEFAULTPAGE
1298                         dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1299 #endif
1300                         if (rc != KERN_SUCCESS) {
1301                                 if (rc != MACH_SEND_INTERRUPTED
1302                                     && vm_fault_debug)
1303                                         printf("%s(0x%x, 0x%x, 0x%x, 0x%x, 0x%x) failed, rc=%d, object=0x%x\n",
1304                                                 "memory_object_data_request",
1305                                                 object->pager,
1306                                                 object->pager_request,
1307                                                 cluster_start + object->paging_offset,
1308                                                 length, access_required,
1309                                                 rc, object);
1310                                 /*
1311                                  *      Don't want to leave a busy page around,
1312                                  *      but the data request may have blocked,
1313                                  *      so check if it's still there and busy.
1314                                  */
1315                                 vm_object_lock(object);
1316                                 for (; length;
1317                                      length -= PAGE_SIZE,
1318                                      cluster_start += PAGE_SIZE_64) {
1319                                         vm_page_t p;
1320                                         if ((p = vm_page_lookup(object,
1321                                                                 cluster_start))
1322                                             && p->absent && p->busy
1323                                             && p != first_m) {
1324                                                 VM_PAGE_FREE(m);
1325                                         }
1326                                 }
1327                                 vm_fault_cleanup(object, first_m);
1328                                 cur_thread->interruptible = interruptible_state;
1329                                 return((rc == MACH_SEND_INTERRUPTED) ?
1330                                         VM_FAULT_INTERRUPTED :
1331                                         VM_FAULT_MEMORY_ERROR);
1332                         }
1333
1334                         /*
1335                          * Retry with same object/offset, since new data may
1336                          * be in a different page (i.e., m is meaningless at
1337                          * this point).
1338                          */
1339                         vm_object_lock(object);
1340                         if ((interruptible != THREAD_UNINT) &&
1341                             (current_thread()->state & TH_ABORT)) {
1342                                 vm_fault_cleanup(object, first_m);
1343                                 cur_thread->interruptible = interruptible_state;
1344                                 return(VM_FAULT_INTERRUPTED);
1345                         }
1346                         continue;
1347                 }
1348
1349                 /*
1350                  * The only case in which we get here is if
1351                  * object has no pager (or unwiring).  If the pager doesn't
1352                  * have the page this is handled in the m->absent case above
1353                  * (and if you change things here you should look above).
1354                  */
1355 #if TRACEFAULTPAGE
1356                 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1357 #endif
1358                 if (object == first_object)
1359                         first_m = m;
1360                 else
1361                         assert(m == VM_PAGE_NULL);
1362
1363                 XPR(XPR_VM_FAULT,
1364                     "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1365                         (integer_t)object, offset, (integer_t)m,
1366                         (integer_t)object->shadow, 0);
1367                 /*
1368                  *      Move on to the next object.  Lock the next
1369                  *      object before unlocking the current one.
1370                  */
1371                 next_object = object->shadow;
1372                 if (next_object == VM_OBJECT_NULL) {
1373                         assert(!must_be_resident);
1374                         /*
1375                          *      If there's no object left, fill the page
1376                          *      in the top object with zeros.  But first we
1377                          *      need to allocate a real page.
1378                          */
1379
1380                         if (object != first_object) {
1381                                 vm_object_paging_end(object);
1382                                 vm_object_unlock(object);
1383
1384                                 object = first_object;
1385                                 offset = first_offset;
1386                                 vm_object_lock(object);
1387                         }
1388
1389                         m = first_m;
1390                         assert(m->object == object);
1391                         first_m = VM_PAGE_NULL;
1392
1393                         if (object->shadow_severed) {
1394                                 VM_PAGE_FREE(m);
1395                                 vm_fault_cleanup(object, VM_PAGE_NULL);
1396                                 cur_thread->interruptible = interruptible_state;
1397                                 return VM_FAULT_MEMORY_ERROR;
1398                         }
1399
1400                         if (VM_PAGE_THROTTLED() ||
1401                             (m->fictitious && !vm_page_convert(m))) {
1402                                 VM_PAGE_FREE(m);
1403                                 vm_fault_cleanup(object, VM_PAGE_NULL);
1404                                 cur_thread->interruptible = interruptible_state;
1405                                 return(VM_FAULT_MEMORY_SHORTAGE);
1406                         }
1407
1408                         if (!no_zero_fill) {
1409                                 vm_object_unlock(object);
1410                                 vm_page_zero_fill(m);
1411                                 if (type_of_fault)
1412                                         *type_of_fault = DBG_ZERO_FILL_FAULT;
1413                                 VM_STAT(zero_fill_count++);
1414                                 vm_object_lock(object);
1415                         }
1416                         vm_page_lock_queues();
1417                         VM_PAGE_QUEUES_REMOVE(m);
1418                         queue_enter(&vm_page_queue_inactive,
1419                                                 m, vm_page_t, pageq);
1420                         m->inactive = TRUE;
1421                         vm_page_inactive_count++;
1422                         vm_page_unlock_queues();
1423                         pmap_clear_modify(m->phys_addr);
1424                         break;
1425                 }
1426                 else {
1427                         if ((object != first_object) || must_be_resident)
1428                                 vm_object_paging_end(object);
1429                         offset += object->shadow_offset;
1430                         hi_offset += object->shadow_offset;
1431                         lo_offset += object->shadow_offset;
1432                         access_required = VM_PROT_READ;
1433                         vm_object_lock(next_object);
1434                         vm_object_unlock(object);
1435                         object = next_object;
1436                         vm_object_paging_begin(object);
1437                 }
1438         }
1439
1440         /*
1441          *      PAGE HAS BEEN FOUND.
1442          *
1443          *      This page (m) is:
1444          *              busy, so that we can play with it;
1445          *              not absent, so that nobody else will fill it;
1446          *              possibly eligible for pageout;
1447          *
1448          *      The top-level page (first_m) is:
1449          *              VM_PAGE_NULL if the page was found in the
1450          *               top-level object;
1451          *              busy, not absent, and ineligible for pageout.
1452          *
1453          *      The current object (object) is locked.  A paging
1454          *      reference is held for the current and top-level
1455          *      objects.
1456          */
1457
1458 #if TRACEFAULTPAGE
1459         dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1460 #endif
1461 #if     EXTRA_ASSERTIONS
1462         assert(m->busy && !m->absent);
1463         assert((first_m == VM_PAGE_NULL) ||
1464                 (first_m->busy && !first_m->absent &&
1465                  !first_m->active && !first_m->inactive));
1466 #endif  /* EXTRA_ASSERTIONS */
1467
1468         XPR(XPR_VM_FAULT,
1469        "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1470                 (integer_t)object, offset, (integer_t)m,
1471                 (integer_t)first_object, (integer_t)first_m);
1472         /*
1473          *      If the page is being written, but isn't
1474          *      already owned by the top-level object,
1475          *      we have to copy it into a new page owned
1476          *      by the top-level object.
1477          */
1478
1479         if (object != first_object) {
1480                 /*
1481                  *      We only really need to copy if we
1482                  *      want to write it.
1483                  */
1484
1485 #if TRACEFAULTPAGE
1486                         dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1487 #endif
1488                 if (fault_type & VM_PROT_WRITE) {
1489                         vm_page_t copy_m;
1490
1491                         assert(!must_be_resident);
1492
1493                         /*
1494                          *      If we try to collapse first_object at this
1495                          *      point, we may deadlock when we try to get
1496                          *      the lock on an intermediate object (since we
1497                          *      have the bottom object locked).  We can't
1498                          *      unlock the bottom object, because the page
1499                          *      we found may move (by collapse) if we do.
1500                          *
1501                          *      Instead, we first copy the page.  Then, when
1502                          *      we have no more use for the bottom object,
1503                          *      we unlock it and try to collapse.
1504                          *
1505                          *      Note that we copy the page even if we didn't
1506                          *      need to... that's the breaks.
1507                          */
1508
1509                         /*
1510                          *      Allocate a page for the copy
1511                          */
1512                         copy_m = vm_page_grab();
1513                         if (copy_m == VM_PAGE_NULL) {
1514                                 RELEASE_PAGE(m);
1515                                 vm_fault_cleanup(object, first_m);
1516                                 cur_thread->interruptible = interruptible_state;
1517                                 return(VM_FAULT_MEMORY_SHORTAGE);
1518                         }
1519
1520
1521                         XPR(XPR_VM_FAULT,
1522                             "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1523                                 (integer_t)object, offset,
1524                                 (integer_t)m, (integer_t)copy_m, 0);
1525                         vm_page_copy(m, copy_m);
1526
1527                         /*
1528                          *      If another map is truly sharing this
1529                          *      page with us, we have to flush all
1530                          *      uses of the original page, since we
1531                          *      can't distinguish those which want the
1532                          *      original from those which need the
1533                          *      new copy.
1534                          *
1535                          *      XXXO If we know that only one map has
1536                          *      access to this page, then we could
1537                          *      avoid the pmap_page_protect() call.
1538                          */
1539
1540                         vm_page_lock_queues();
1541                         assert(!m->cleaning);
1542                         pmap_page_protect(m->phys_addr, VM_PROT_NONE);
1543                         vm_page_deactivate(m);
1544                         copy_m->dirty = TRUE;
1545                         /*
1546                          * Setting reference here prevents this fault from
1547                          * being counted as a (per-thread) reactivate as well
1548                          * as a copy-on-write.
1549                          */
1550                         first_m->reference = TRUE;
1551                         vm_page_unlock_queues();
1552
1553                         /*
1554                          *      We no longer need the old page or object.
1555                          */
1556
1557                         PAGE_WAKEUP_DONE(m);
1558                         vm_object_paging_end(object);
1559                         vm_object_unlock(object);
1560
1561                         if (type_of_fault)
1562                                 *type_of_fault = DBG_COW_FAULT;
1563                         VM_STAT(cow_faults++);
1564                         current_task()->cow_faults++;
1565                         object = first_object;
1566                         offset = first_offset;
1567
1568                         vm_object_lock(object);
1569                         VM_PAGE_FREE(first_m);
1570                         first_m = VM_PAGE_NULL;
1571                         assert(copy_m->busy);
1572                         vm_page_insert(copy_m, object, offset);
1573                         m = copy_m;
1574
1575                         /*
1576                          *      Now that we've gotten the copy out of the
1577                          *      way, let's try to collapse the top object.
1578                          *      But we have to play ugly games with
1579                          *      paging_in_progress to do that...
1580                          */
1581
1582                         vm_object_paging_end(object);
1583                         vm_object_collapse(object);
1584                         vm_object_paging_begin(object);
1585
1586                 }
1587                 else {
1588                         *protection &= (~VM_PROT_WRITE);
1589                 }
1590         }
1591
1592         /*
1593          *      Now check whether the page needs to be pushed into the
1594          *      copy object.  The use of asymmetric copy on write for
1595          *      shared temporary objects means that we may do two copies to
1596          *      satisfy the fault; one above to get the page from a
1597          *      shadowed object, and one here to push it into the copy.
1598          */
1599
1600         while (first_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
1601                (copy_object = first_object->copy) != VM_OBJECT_NULL) {
1602                 vm_object_offset_t      copy_offset;
1603                 vm_page_t               copy_m;
1604
1605 #if TRACEFAULTPAGE
1606                 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);    /* (TEST/DEBUG) */
1607 #endif
1608                 /*
1609                  *      If the page is being written, but hasn't been
1610                  *      copied to the copy-object, we have to copy it there.
1611                  */
1612
1613                 if ((fault_type & VM_PROT_WRITE) == 0) {
1614                         *protection &= ~VM_PROT_WRITE;
1615                         break;
1616                 }
1617
1618                 /*
1619                  *      If the page was guaranteed to be resident,
1620                  *      we must have already performed the copy.
1621                  */
1622
1623                 if (must_be_resident)
1624                         break;
1625
1626                 /*
1627                  *      Try to get the lock on the copy_object.
1628                  */
1629                 if (!vm_object_lock_try(copy_object)) {
1630                         vm_object_unlock(object);
1631
1632                         mutex_pause();  /* wait a bit */
1633
1634                         vm_object_lock(object);
1635                         continue;
1636                 }
1637
1638                 /*
1639                  *      Make another reference to the copy-object,
1640                  *      to keep it from disappearing during the
1641                  *      copy.
1642                  */
1643                 assert(copy_object->ref_count > 0);
1644                 copy_object->ref_count++;
1645                 VM_OBJ_RES_INCR(copy_object);
1646
1647                 /*
1648                  *      Does the page exist in the copy?
1649                  */
1650                 copy_offset = first_offset - copy_object->shadow_offset;
1651                 if (copy_object->size <= copy_offset)
1652                         /*
1653                          * Copy object doesn't cover this page -- do nothing.
1654                          */
1655                         ;
1656                 else if ((copy_m =
1657                         vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1658                         /* Page currently exists in the copy object */
1659                         if (copy_m->busy) {
1660                                 /*
1661                                  *      If the page is being brought
1662                                  *      in, wait for it and then retry.
1663                                  */
1664                                 RELEASE_PAGE(m);
1665                                 /* take an extra ref so object won't die */
1666                                 assert(copy_object->ref_count > 0);
1667                                 copy_object->ref_count++;
1668                                 vm_object_res_reference(copy_object);
1669                                 vm_object_unlock(copy_object);
1670                                 vm_fault_cleanup(object, first_m);
1671                                 counter(c_vm_fault_page_block_backoff_kernel++);
1672                                 vm_object_lock(copy_object);
1673                                 assert(copy_object->ref_count > 0);
1674                                 VM_OBJ_RES_DECR(copy_object);
1675                                 copy_object->ref_count--;
1676                                 assert(copy_object->ref_count > 0);
1677                                 copy_m = vm_page_lookup(copy_object, copy_offset);
1678                                 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1679                                         PAGE_ASSERT_WAIT(copy_m, interruptible);
1680                                         vm_object_unlock(copy_object);
1681                                         wait_result = thread_block((void (*)(void))0);
1682                                         vm_object_deallocate(copy_object);
1683                                         goto backoff;
1684                                 } else {
1685                                         vm_object_unlock(copy_object);
1686                                         vm_object_deallocate(copy_object);
1687                                         cur_thread->interruptible = interruptible_state;
1688                                         return VM_FAULT_RETRY;
1689                                 }
1690                         }
1691                 }
1692                 else if (!PAGED_OUT(copy_object, copy_offset)) {
1693                         /*
1694                          * If PAGED_OUT is TRUE, then the page used to exist
1695                          * in the copy-object, and has already been paged out.
1696                          * We don't need to repeat this. If PAGED_OUT is
1697                          * FALSE, then either we don't know (!pager_created,
1698                          * for example) or it hasn't been paged out.
1699                          * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1700                          * We must copy the page to the copy object.
1701                          */
1702
1703                         /*
1704                          *      Allocate a page for the copy
1705                          */
1706                         copy_m = vm_page_alloc(copy_object, copy_offset);
1707                         if (copy_m == VM_PAGE_NULL) {
1708                                 RELEASE_PAGE(m);
1709                                 VM_OBJ_RES_DECR(copy_object);
1710                                 copy_object->ref_count--;
1711                                 assert(copy_object->ref_count > 0);
1712                                 vm_object_unlock(copy_object);
1713                                 vm_fault_cleanup(object, first_m);
1714                                 cur_thread->interruptible = interruptible_state;
1715                                 return(VM_FAULT_MEMORY_SHORTAGE);
1716                         }
1717
1718                         /*
1719                          *      Must copy page into copy-object.
1720                          */
1721
1722                         vm_page_copy(m, copy_m);
1723
1724                         /*
1725                          *      If the old page was in use by any users
1726                          *      of the copy-object, it must be removed
1727                          *      from all pmaps.  (We can't know which
1728                          *      pmaps use it.)
1729                          */
1730
1731                         vm_page_lock_queues();
1732                         assert(!m->cleaning);
1733                         pmap_page_protect(m->phys_addr, VM_PROT_NONE);
1734                         copy_m->dirty = TRUE;
1735                         vm_page_unlock_queues();
1736
1737                         /*
1738                          *      If there's a pager, then immediately
1739                          *      page out this page, using the "initialize"
1740                          *      option.  Else, we use the copy.
1741                          */
1742
1743                         if
1744 #if     MACH_PAGEMAP
1745                           ((!copy_object->pager_created) ||
1746                                 vm_external_state_get(
1747                                         copy_object->existence_map, copy_offset)
1748                                 == VM_EXTERNAL_STATE_ABSENT)
1749 #else
1750                           (!copy_object->pager_created)
1751 #endif
1752                                 {
1753                                 vm_page_lock_queues();
1754                                 vm_page_activate(copy_m);
1755                                 vm_page_unlock_queues();
1756                                 PAGE_WAKEUP_DONE(copy_m);
1757                         }
1758                         else {
1759                                 assert(copy_m->busy == TRUE);
1760
1761                                 /*
1762                                  *      The page is already ready for pageout:
1763                                  *      not on pageout queues and busy.
1764                                  *      Unlock everything except the
1765                                  *      copy_object itself.
1766                                  */
1767
1768                                 vm_object_unlock(object);
1769
1770                                 /*
1771                                  *      Write the page to the copy-object,
1772                                  *      flushing it from the kernel.
1773                                  */
1774
1775                                 vm_pageout_initialize_page(copy_m);
1776
1777                                 /*
1778                                  *      Since the pageout may have
1779                                  *      temporarily dropped the
1780                                  *      copy_object's lock, we
1781                                  *      check whether we'll have
1782                                  *      to deallocate the hard way.
1783                                  */
1784
1785                                 if ((copy_object->shadow != object) ||
1786                                     (copy_object->ref_count == 1)) {
1787                                         vm_object_unlock(copy_object);
1788                                         vm_object_deallocate(copy_object);
1789                                         vm_object_lock(object);
1790                                         continue;
1791                                 }
1792
1793                                 /*
1794                                  *      Pick back up the old object's
1795                                  *      lock.  [It is safe to do so,
1796                                  *      since it must be deeper in the
1797                                  *      object tree.]
1798                                  */
1799
1800                                 vm_object_lock(object);
1801                         }
1802
1803                         /*
1804                          *      Because we're pushing a page upward
1805                          *      in the object tree, we must restart
1806                          *      any faults that are waiting here.
1807                          *      [Note that this is an expansion of
1808                          *      PAGE_WAKEUP that uses the THREAD_RESTART
1809                          *      wait result].  Can't turn off the page's
1810                          *      busy bit because we're not done with it.
1811                          */
1812
1813                         if (m->wanted) {
1814                                 m->wanted = FALSE;
1815                                 thread_wakeup_with_result((event_t) m,
1816                                         THREAD_RESTART);
1817                         }
1818                 }
1819
1820                 /*
1821                  *      The reference count on copy_object must be
1822                  *      at least 2: one for our extra reference,
1823                  *      and at least one from the outside world
1824                  *      (we checked that when we last locked
1825                  *      copy_object).
1826                  */
1827                 copy_object->ref_count--;
1828                 assert(copy_object->ref_count > 0);
1829                 VM_OBJ_RES_DECR(copy_object);
1830                 vm_object_unlock(copy_object);
1831
1832                 break;
1833         }
1834
1835         *result_page = m;
1836         *top_page = first_m;
1837
1838         XPR(XPR_VM_FAULT,
1839                 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1840                 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0);
1841         /*
1842          *      If the page can be written, assume that it will be.
1843          *      [Earlier, we restrict the permission to allow write
1844          *      access only if the fault so required, so we don't
1845          *      mark read-only data as dirty.]
1846          */
1847
1848 #if     !VM_FAULT_STATIC_CONFIG
1849         if (vm_fault_dirty_handling && (*protection & VM_PROT_WRITE))
1850                 m->dirty = TRUE;
1851 #endif
1852 #if TRACEFAULTPAGE
1853         dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_page_deactivate_behind);  /* (TEST/DEBUG) */
1854 #endif
1855         if (vm_page_deactivate_behind) {
1856                 if (offset && /* don't underflow */
1857                         (object->last_alloc == (offset - PAGE_SIZE_64))) {
1858                         m = vm_page_lookup(object, object->last_alloc);
1859                         if ((m != VM_PAGE_NULL) && !m->busy) {
1860                                 vm_page_lock_queues();
1861                                 vm_page_deactivate(m);
1862                                 vm_page_unlock_queues();
1863                         }
1864 #if TRACEFAULTPAGE
1865                         dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1866 #endif
1867                 }
1868                 object->last_alloc = offset;
1869         }
1870 #if TRACEFAULTPAGE
1871         dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);       /* (TEST/DEBUG) */
1872 #endif
1873         cur_thread->interruptible = interruptible_state;
1874         return(VM_FAULT_SUCCESS);
1875
1876 #if 0
1877     block_and_backoff:
1878         vm_fault_cleanup(object, first_m);
1879
1880         counter(c_vm_fault_page_block_backoff_kernel++);
1881         thread_block((void (*)(void))0);
1882 #endif
1883
1884     backoff:
1885         cur_thread->interruptible = interruptible_state;
1886         if (wait_result == THREAD_INTERRUPTED)
1887                 return VM_FAULT_INTERRUPTED;
1888         return VM_FAULT_RETRY;
1889
1890 #undef  RELEASE_PAGE
1891 }
1892
1893 /*
1894  *      Routine:        vm_fault
1895  *      Purpose:
1896  *              Handle page faults, including pseudo-faults
1897  *              used to change the wiring status of pages.
1898  *      Returns:
1899  *              Explicit continuations have been removed.
1900  *      Implementation:
1901  *              vm_fault and vm_fault_page save mucho state
1902  *              in the moral equivalent of a closure.  The state
1903  *              structure is allocated when first entering vm_fault
1904  *              and deallocated when leaving vm_fault.
1905  */
1906
1907 kern_return_t
1908 vm_fault(
1909         vm_map_t        map,
1910         vm_offset_t     vaddr,
1911         vm_prot_t       fault_type,
1912         boolean_t       change_wiring,
1913         int             interruptible)
1914 {
1915         vm_map_version_t        version;        /* Map version for verificiation */
1916         boolean_t               wired;          /* Should mapping be wired down? */
1917         vm_object_t             object;         /* Top-level object */
1918         vm_object_offset_t      offset;         /* Top-level offset */
1919         vm_prot_t               prot;           /* Protection for mapping */
1920         vm_behavior_t           behavior;       /* Expected paging behavior */
1921         vm_object_offset_t      lo_offset, hi_offset;
1922         vm_object_t             old_copy_object; /* Saved copy object */
1923         vm_page_t               result_page;    /* Result of vm_fault_page */
1924         vm_page_t               top_page;       /* Placeholder page */
1925         kern_return_t           kr;
1926
1927         register
1928         vm_page_t               m;      /* Fast access to result_page */
1929         kern_return_t           error_code;     /* page error reasons */
1930         register
1931         vm_object_t             cur_object;
1932         register
1933         vm_object_offset_t      cur_offset;
1934         vm_page_t               cur_m;
1935         vm_object_t             new_object;
1936         int                     type_of_fault;
1937         vm_map_t                pmap_map = map;
1938         vm_map_t                original_map = map;
1939         pmap_t                  pmap = NULL;
1940         boolean_t               funnel_set = FALSE;
1941         funnel_t                *curflock;
1942         thread_t                cur_thread;
1943         boolean_t               interruptible_state;
1944
1945
1946         KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_START,
1947                               vaddr,
1948                               0,
1949                               0,
1950                               0,
1951                               0);
1952
1953         cur_thread = current_thread();
1954
1955         interruptible_state = cur_thread->interruptible;
1956         if (interruptible == THREAD_UNINT)
1957                 cur_thread->interruptible = FALSE;
1958
1959         /*
1960          * assume we will hit a page in the cache
1961          * otherwise, explicitly override with
1962          * the real fault type once we determine it
1963          */
1964         type_of_fault = DBG_CACHE_HIT_FAULT;
1965
1966         VM_STAT(faults++);
1967         current_task()->faults++;
1968
1969         /*
1970          * drop funnel if it is already held. Then restore while returning
1971          */
1972         if ((cur_thread->funnel_state & TH_FN_OWNED) == TH_FN_OWNED) {
1973                 funnel_set = TRUE;
1974                 curflock = cur_thread->funnel_lock;
1975                 thread_funnel_set( curflock , FALSE);
1976         }
1977
1978     RetryFault: ;
1979
1980         /*
1981          *      Find the backing store object and offset into
1982          *      it to begin the search.
1983          */
1984         map = original_map;
1985         vm_map_lock_read(map);
1986         kr = vm_map_lookup_locked(&map, vaddr, fault_type, &version,
1987                                 &object, &offset,
1988                                 &prot, &wired,
1989                                 &behavior, &lo_offset, &hi_offset, &pmap_map);
1990
1991         pmap = pmap_map->pmap;
1992
1993         if (kr != KERN_SUCCESS) {
1994                 vm_map_unlock_read(map);
1995                 goto done;
1996         }
1997
1998         /*
1999          *      If the page is wired, we must fault for the current protection
2000          *      value, to avoid further faults.
2001          */
2002
2003         if (wired)
2004                 fault_type = prot | VM_PROT_WRITE;
2005
2006 #if     VM_FAULT_CLASSIFY
2007         /*
2008          *      Temporary data gathering code
2009          */
2010         vm_fault_classify(object, offset, fault_type);
2011 #endif
2012         /*
2013          *      Fast fault code.  The basic idea is to do as much as
2014          *      possible while holding the map lock and object locks.
2015          *      Busy pages are not used until the object lock has to
2016          *      be dropped to do something (copy, zero fill, pmap enter).
2017          *      Similarly, paging references aren't acquired until that
2018          *      point, and object references aren't used.
2019          *
2020          *      If we can figure out what to do
2021          *      (zero fill, copy on write, pmap enter) while holding
2022          *      the locks, then it gets done.  Otherwise, we give up,
2023          *      and use the original fault path (which doesn't hold
2024          *      the map lock, and relies on busy pages).
2025          *      The give up cases include:
2026          *              - Have to talk to pager.
2027          *              - Page is busy, absent or in error.
2028          *              - Pager has locked out desired access.
2029          *              - Fault needs to be restarted.
2030          *              - Have to push page into copy object.
2031          *
2032          *      The code is an infinite loop that moves one level down
2033          *      the shadow chain each time.  cur_object and cur_offset
2034          *      refer to the current object being examined. object and offset
2035          *      are the original object from the map.  The loop is at the
2036          *      top level if and only if object and cur_object are the same.
2037          *
2038          *      Invariants:  Map lock is held throughout.  Lock is held on
2039          *              original object and cur_object (if different) when
2040          *              continuing or exiting loop.
2041          *
2042          */
2043
2044
2045         /*
2046          *      If this page is to be inserted in a copy delay object
2047          *      for writing, and if the object has a copy, then the
2048          *      copy delay strategy is implemented in the slow fault page.
2049          */
2050         if (object->copy_strategy != MEMORY_OBJECT_COPY_DELAY ||
2051             object->copy == VM_OBJECT_NULL ||
2052             (fault_type & VM_PROT_WRITE) == 0) {
2053         cur_object = object;
2054         cur_offset = offset;
2055
2056         while (TRUE) {
2057                 m = vm_page_lookup(cur_object, cur_offset);
2058                 if (m != VM_PAGE_NULL) {
2059                         if (m->busy)
2060                                 break;
2061
2062                         if (m->unusual && (m->error || m->restart ||
2063                             m->absent || (fault_type & m->page_lock))) {
2064
2065                         /*
2066                                  *      Unusual case. Give up.
2067                                  */
2068                                 break;
2069                         }
2070
2071                         /*
2072                          *      Two cases of map in faults:
2073                          *          - At top level w/o copy object.
2074                          *          - Read fault anywhere.
2075                          *              --> must disallow write.
2076                          */
2077
2078                         if (object == cur_object &&
2079                             object->copy == VM_OBJECT_NULL)
2080                                 goto FastMapInFault;
2081
2082                         if ((fault_type & VM_PROT_WRITE) == 0) {
2083
2084                                 prot &= ~VM_PROT_WRITE;
2085
2086                                 /*
2087                                  *      Set up to map the page ...
2088                                  *      mark the page busy, drop
2089                                  *      locks and take a paging reference
2090                                  *      on the object with the page.
2091                                  */
2092
2093                                 if (object != cur_object) {
2094                                         vm_object_unlock(object);
2095                                         object = cur_object;
2096                                 }
2097 FastMapInFault:
2098                                 m->busy = TRUE;
2099
2100                                 vm_object_paging_begin(object);
2101                                 vm_object_unlock(object);
2102
2103 FastPmapEnter:
2104                                 /*
2105                                  *      Check a couple of global reasons to
2106                                  *      be conservative about write access.
2107                                  *      Then do the pmap_enter.
2108                                  */
2109 #if     !VM_FAULT_STATIC_CONFIG
2110                                 if (vm_fault_dirty_handling
2111 #if     MACH_KDB
2112                                     || db_watchpoint_list
2113 #endif
2114                                     && (fault_type & VM_PROT_WRITE) == 0)
2115                                         prot &= ~VM_PROT_WRITE;
2116 #else   /* STATIC_CONFIG */
2117 #if     MACH_KDB
2118                                 if (db_watchpoint_list
2119                                     && (fault_type & VM_PROT_WRITE) == 0)
2120                                         prot &= ~VM_PROT_WRITE;
2121 #endif  /* MACH_KDB */
2122 #endif  /* STATIC_CONFIG */
2123                                 PMAP_ENTER(pmap, vaddr, m, prot, wired);
2124                                 pmap_attribute(pmap,
2125                                                vaddr,
2126                                                PAGE_SIZE,
2127                                                MATTR_CACHE,
2128                                                &mv_cache_sync);
2129
2130                                 if (m->clustered) {
2131                                         vm_pagein_cluster_used++;
2132                                         m->clustered = FALSE;
2133
2134                                 }
2135                                 /*
2136                                  *      Grab the object lock to manipulate
2137                                  *      the page queues.  Change wiring
2138                                  *      case is obvious.  In soft ref bits
2139                                  *      case activate page only if it fell
2140                                  *      off paging queues, otherwise just
2141                                  *      activate it if it's inactive.
2142                                  *
2143                                  *      NOTE: original vm_fault code will
2144                                  *      move active page to back of active
2145                                  *      queue.  This code doesn't.
2146                                  */
2147                                 vm_object_lock(object);
2148                                 vm_page_lock_queues();
2149
2150                                 m->reference = TRUE;
2151
2152                                 if (change_wiring) {
2153                                         if (wired)
2154                                                 vm_page_wire(m);
2155                                         else
2156                                                 vm_page_unwire(m);
2157                                 }
2158 #if VM_FAULT_STATIC_CONFIG
2159                                 else {
2160                                         if (!m->active && !m->inactive)
2161                                                 vm_page_activate(m);
2162                                 }
2163 #else
2164                                 else if (software_reference_bits) {
2165                                         if (!m->active && !m->inactive)
2166                                                 vm_page_activate(m);
2167                                 }
2168                                 else if (!m->active) {
2169                                         vm_page_activate(m);
2170                                 }
2171 #endif
2172                                 vm_page_unlock_queues();
2173
2174                                 /*
2175                                  *      That's it, clean up and return.
2176                                  */
2177                                 PAGE_WAKEUP_DONE(m);
2178                                 vm_object_paging_end(object);
2179                                 vm_object_unlock(object);
2180                                 vm_map_unlock_read(map);
2181                                 if(pmap_map != map)
2182                                         vm_map_unlock(pmap_map);
2183
2184                                 if (funnel_set) {
2185                                         thread_funnel_set( curflock, TRUE);
2186                                         funnel_set = FALSE;
2187                                 }
2188                                 cur_thread->interruptible = interruptible_state;
2189
2190                                 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
2191                                                       vaddr,
2192                                                       type_of_fault,
2193                                                       KERN_SUCCESS,
2194                                                       0,
2195                                                       0);
2196                                 return KERN_SUCCESS;
2197                         }
2198
2199                         /*
2200                          *      Copy on write fault.  If objects match, then
2201                          *      object->copy must not be NULL (else control
2202                          *      would be in previous code block), and we
2203                          *      have a potential push into the copy object
2204                          *      with which we won't cope here.
2205                          */
2206
2207                         if (cur_object == object)
2208                                 break;
2209
2210                         /*
2211                          *      This is now a shadow based copy on write
2212                          *      fault -- it requires a copy up the shadow
2213                          *      chain.
2214                          *
2215                          *      Allocate a page in the original top level
2216                          *      object. Give up if allocate fails.  Also
2217                          *      need to remember current page, as it's the
2218                          *      source of the copy.
2219                          */
2220                         cur_m = m;
2221                         m = vm_page_grab();
2222                         if (m == VM_PAGE_NULL) {
2223                                 break;
2224                         }
2225
2226                         /*
2227                          *      Now do the copy.  Mark the source busy
2228                          *      and take out paging references on both
2229                          *      objects.
2230                          *
2231                          *      NOTE: This code holds the map lock across
2232                          *      the page copy.
2233                          */
2234
2235                         cur_m->busy = TRUE;
2236                         vm_page_copy(cur_m, m);
2237                         vm_page_insert(m, object, offset);
2238
2239                         vm_object_paging_begin(cur_object);
2240                         vm_object_paging_begin(object);
2241
2242                         type_of_fault = DBG_COW_FAULT;
2243                         VM_STAT(cow_faults++);
2244                         current_task()->cow_faults++;
2245
2246                         /*
2247                          *      Now cope with the source page and object
2248                          *      If the top object has a ref count of 1
2249                          *      then no other map can access it, and hence
2250                          *      it's not necessary to do the pmap_page_protect.
2251                          */
2252
2253
2254                         vm_page_lock_queues();
2255                         vm_page_deactivate(cur_m);
2256                         m->dirty = TRUE;
2257                         pmap_page_protect(cur_m->phys_addr,
2258                                                   VM_PROT_NONE);
2259                         vm_page_unlock_queues();
2260
2261                         PAGE_WAKEUP_DONE(cur_m);
2262                         vm_object_paging_end(cur_object);
2263                         vm_object_unlock(cur_object);
2264
2265                         /*
2266                          *      Slight hack to call vm_object collapse
2267                          *      and then reuse common map in code.
2268                          *      note that the object lock was taken above.
2269                          */
2270
2271                         vm_object_paging_end(object);
2272                         vm_object_collapse(object);
2273                         vm_object_paging_begin(object);
2274                         vm_object_unlock(object);
2275
2276                         goto FastPmapEnter;
2277                 }
2278                 else {
2279
2280                         /*
2281                          *      No page at cur_object, cur_offset
2282                          */
2283
2284                         if (cur_object->pager_created) {
2285
2286                                 /*
2287                                  *      Have to talk to the pager.  Give up.
2288                                  */
2289
2290                                 break;
2291                         }
2292
2293
2294                         if (cur_object->shadow == VM_OBJECT_NULL) {
2295
2296                                 if (cur_object->shadow_severed) {
2297                                         vm_object_paging_end(object);
2298                                         vm_object_unlock(object);
2299                                         vm_map_unlock_read(map);
2300                                         if(pmap_map != map)
2301                                                 vm_map_unlock(pmap_map);
2302
2303                                         if (funnel_set) {
2304                                                 thread_funnel_set( curflock, TRUE);
2305                                                 funnel_set = FALSE;
2306                                         }
2307                                         cur_thread->interruptible = interruptible_state;
2308
2309                                         return VM_FAULT_MEMORY_ERROR;
2310                                 }
2311
2312                                 /*
2313                                  *      Zero fill fault.  Page gets
2314                                  *      filled in top object. Insert
2315                                  *      page, then drop any lower lock.
2316                                  *      Give up if no page.
2317                                  */
2318                                 if ((vm_page_free_target -
2319                                    ((vm_page_free_target-vm_page_free_min)>>2))
2320                                                 > vm_page_free_count) {
2321                                         break;
2322                                 }
2323                                 m = vm_page_alloc(object, offset);
2324                                 if (m == VM_PAGE_NULL) {
2325                                         break;
2326                                 }
2327
2328                                 if (cur_object != object)
2329                                         vm_object_unlock(cur_object);
2330
2331                                 vm_object_paging_begin(object);
2332                                 vm_object_unlock(object);
2333
2334                                 /*
2335                                  *      Now zero fill page and map it.
2336                                  *      the page is probably going to
2337                                  *      be written soon, so don't bother
2338                                  *      to clear the modified bit
2339                                  *
2340                                  *      NOTE: This code holds the map
2341                                  *      lock across the zero fill.
2342                                  */
2343
2344                                 if (!map->no_zero_fill) {
2345                                         vm_page_zero_fill(m);
2346                                         type_of_fault = DBG_ZERO_FILL_FAULT;
2347                                         VM_STAT(zero_fill_count++);
2348                                 }
2349                                 vm_page_lock_queues();
2350                                 VM_PAGE_QUEUES_REMOVE(m);
2351                                 queue_enter(&vm_page_queue_inactive,
2352                                                         m, vm_page_t, pageq);
2353                                 m->inactive = TRUE;
2354                                 vm_page_inactive_count++;
2355                                 vm_page_unlock_queues();
2356                                 goto FastPmapEnter;
2357                         }
2358
2359                         /*
2360                          *      On to the next level
2361                          */
2362
2363                         cur_offset += cur_object->shadow_offset;
2364                         new_object = cur_object->shadow;
2365                         vm_object_lock(new_object);
2366                         if (cur_object != object)
2367                                 vm_object_unlock(cur_object);
2368                         cur_object = new_object;
2369
2370                         continue;
2371                 }
2372         }
2373
2374         /*
2375          *      Cleanup from fast fault failure.  Drop any object
2376          *      lock other than original and drop map lock.
2377          */
2378
2379         if (object != cur_object)
2380                 vm_object_unlock(cur_object);
2381         }
2382         vm_map_unlock_read(map);
2383         if(pmap_map != map)
2384                 vm_map_unlock(pmap_map);
2385
2386         /*
2387          *      Make a reference to this object to
2388          *      prevent its disposal while we are messing with
2389          *      it.  Once we have the reference, the map is free
2390          *      to be diddled.  Since objects reference their
2391          *      shadows (and copies), they will stay around as well.
2392          */
2393
2394         assert(object->ref_count > 0);
2395         object->ref_count++;
2396         vm_object_res_reference(object);
2397         vm_object_paging_begin(object);
2398
2399         XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
2400         kr = vm_fault_page(object, offset, fault_type,
2401                            (change_wiring && !wired),
2402                            interruptible,
2403                            lo_offset, hi_offset, behavior,
2404                            &prot, &result_page, &top_page,
2405                            &type_of_fault,
2406                            &error_code, map->no_zero_fill, FALSE);
2407
2408         /*
2409          *      If we didn't succeed, lose the object reference immediately.
2410          */
2411
2412         if (kr != VM_FAULT_SUCCESS)
2413                 vm_object_deallocate(object);
2414
2415         /*
2416          *      See why we failed, and take corrective action.
2417          */
2418
2419         switch (kr) {
2420                 case VM_FAULT_SUCCESS:
2421                         break;
2422                 case VM_FAULT_MEMORY_SHORTAGE:
2423                         if (vm_page_wait((change_wiring) ?
2424                                          THREAD_UNINT :
2425                                          THREAD_ABORTSAFE))
2426                                 goto RetryFault;
2427                         /* fall thru */
2428                 case VM_FAULT_INTERRUPTED:
2429                         kr = KERN_ABORTED;
2430                         goto done;
2431                 case VM_FAULT_RETRY:
2432                         goto RetryFault;
2433                 case VM_FAULT_FICTITIOUS_SHORTAGE:
2434                         vm_page_more_fictitious();
2435                         goto RetryFault;
2436                 case VM_FAULT_MEMORY_ERROR:
2437                         if (error_code)
2438                                 kr = error_code;
2439                         else
2440                                 kr = KERN_MEMORY_ERROR;
2441                         goto done;
2442         }
2443
2444         m = result_page;
2445
2446         assert((change_wiring && !wired) ?
2447                (top_page == VM_PAGE_NULL) :
2448                ((top_page == VM_PAGE_NULL) == (m->object == object)));
2449
2450         /*
2451          *      How to clean up the result of vm_fault_page.  This
2452          *      happens whether the mapping is entered or not.
2453          */
2454
2455 #define UNLOCK_AND_DEALLOCATE                           \
2456         MACRO_BEGIN                                     \
2457         vm_fault_cleanup(m->object, top_page);          \
2458         vm_object_deallocate(object);                   \
2459         MACRO_END
2460
2461         /*
2462          *      What to do with the resulting page from vm_fault_page
2463          *      if it doesn't get entered into the physical map:
2464          */
2465
2466 #define RELEASE_PAGE(m)                                 \
2467         MACRO_BEGIN                                     \
2468         PAGE_WAKEUP_DONE(m);                            \
2469         vm_page_lock_queues();                          \
2470         if (!m->active && !m->inactive)                 \
2471                 vm_page_activate(m);                    \
2472         vm_page_unlock_queues();                        \
2473         MACRO_END
2474
2475         /*
2476          *      We must verify that the maps have not changed
2477          *      since our last lookup.
2478          */
2479
2480         old_copy_object = m->object->copy;
2481
2482         vm_object_unlock(m->object);
2483         if ((map != original_map) || !vm_map_verify(map, &version)) {
2484                 vm_object_t             retry_object;
2485                 vm_object_offset_t      retry_offset;
2486                 vm_prot_t               retry_prot;
2487
2488                 /*
2489                  *      To avoid trying to write_lock the map while another
2490                  *      thread has it read_locked (in vm_map_pageable), we
2491                  *      do not try for write permission.  If the page is
2492                  *      still writable, we will get write permission.  If it
2493                  *      is not, or has been marked needs_copy, we enter the
2494                  *      mapping without write permission, and will merely
2495                  *      take another fault.
2496                  */
2497                 map = original_map;
2498                 vm_map_lock_read(map);
2499                 kr = vm_map_lookup_locked(&map, vaddr,
2500                                    fault_type & ~VM_PROT_WRITE, &version,
2501                                    &retry_object, &retry_offset, &retry_prot,
2502                                    &wired, &behavior, &lo_offset, &hi_offset,
2503                                    &pmap_map);
2504                 pmap = pmap_map->pmap;
2505
2506                 if (kr != KERN_SUCCESS) {
2507                         vm_map_unlock_read(map);
2508                         vm_object_lock(m->object);
2509                         RELEASE_PAGE(m);
2510                         UNLOCK_AND_DEALLOCATE;
2511                         goto done;
2512                 }
2513
2514                 vm_object_unlock(retry_object);
2515                 vm_object_lock(m->object);
2516
2517                 if ((retry_object != object) ||
2518                     (retry_offset != offset)) {
2519                         vm_map_unlock_read(map);
2520                         if(pmap_map != map)
2521                                 vm_map_unlock(pmap_map);
2522                         RELEASE_PAGE(m);
2523                         UNLOCK_AND_DEALLOCATE;
2524                         goto RetryFault;
2525                 }
2526
2527                 /*
2528                  *      Check whether the protection has changed or the object
2529                  *      has been copied while we left the map unlocked.
2530                  */
2531                 prot &= retry_prot;
2532                 vm_object_unlock(m->object);
2533         }
2534         vm_object_lock(m->object);
2535
2536         /*
2537          *      If the copy object changed while the top-level object
2538          *      was unlocked, then we must take away write permission.
2539          */
2540
2541         if (m->object->copy != old_copy_object)
2542                 prot &= ~VM_PROT_WRITE;
2543
2544         /*
2545          *      If we want to wire down this page, but no longer have
2546          *      adequate permissions, we must start all over.
2547          */
2548
2549         if (wired && (fault_type != (prot|VM_PROT_WRITE))) {
2550                 vm_map_verify_done(map, &version);
2551                 if(pmap_map != map)
2552                         vm_map_unlock(pmap_map);
2553                 RELEASE_PAGE(m);
2554                 UNLOCK_AND_DEALLOCATE;
2555                 goto RetryFault;
2556         }
2557
2558         /*
2559          *      It's critically important that a wired-down page be faulted
2560          *      only once in each map for which it is wired.
2561          */
2562         vm_object_unlock(m->object);
2563
2564         /*
2565          *      Put this page into the physical map.
2566          *      We had to do the unlock above because pmap_enter
2567          *      may cause other faults.  The page may be on
2568          *      the pageout queues.  If the pageout daemon comes
2569          *      across the page, it will remove it from the queues.
2570          */
2571         PMAP_ENTER(pmap, vaddr, m, prot, wired);
2572
2573         /* Sync I & D caches for new mapping*/
2574         pmap_attribute(pmap,
2575                        vaddr,
2576                        PAGE_SIZE,
2577                        MATTR_CACHE,
2578                        &mv_cache_sync);
2579
2580         /*
2581          *      If the page is not wired down and isn't already
2582          *      on a pageout queue, then put it where the
2583          *      pageout daemon can find it.
2584          */
2585         vm_object_lock(m->object);
2586         vm_page_lock_queues();
2587         if (change_wiring) {
2588                 if (wired)
2589                         vm_page_wire(m);
2590                 else
2591                         vm_page_unwire(m);
2592         }
2593 #if     VM_FAULT_STATIC_CONFIG
2594         else {
2595                 if (!m->active && !m->inactive)
2596                         vm_page_activate(m);
2597                 m->reference = TRUE;
2598         }
2599 #else
2600         else if (software_reference_bits) {
2601                 if (!m->active && !m->inactive)
2602                         vm_page_activate(m);
2603                 m->reference = TRUE;
2604         } else {
2605                 vm_page_activate(m);
2606         }
2607 #endif
2608         vm_page_unlock_queues();
2609
2610         /*
2611          *      Unlock everything, and return
2612          */
2613
2614         vm_map_verify_done(map, &version);
2615         if(pmap_map != map)
2616                 vm_map_unlock(pmap_map);
2617         PAGE_WAKEUP_DONE(m);
2618         kr = KERN_SUCCESS;
2619         UNLOCK_AND_DEALLOCATE;
2620
2621 #undef  UNLOCK_AND_DEALLOCATE
2622 #undef  RELEASE_PAGE
2623
2624     done:
2625         if (funnel_set) {
2626                 thread_funnel_set( curflock, TRUE);
2627                 funnel_set = FALSE;
2628         }
2629         cur_thread->interruptible = interruptible_state;
2630
2631         KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
2632                               vaddr,
2633                               type_of_fault,
2634                               kr,
2635                               0,
2636                               0);
2637         return(kr);
2638 }
2639
2640 /*
2641  *      vm_fault_wire:
2642  *
2643  *      Wire down a range of virtual addresses in a map.
2644  */
2645 kern_return_t
2646 vm_fault_wire(
2647         vm_map_t        map,
2648         vm_map_entry_t  entry,
2649         pmap_t          pmap)
2650 {
2651
2652         register vm_offset_t    va;
2653         register vm_offset_t    end_addr = entry->vme_end;
2654         register kern_return_t  rc;
2655
2656         assert(entry->in_transition);
2657
2658         /*
2659          *      Inform the physical mapping system that the
2660          *      range of addresses may not fault, so that
2661          *      page tables and such can be locked down as well.
2662          */
2663
2664         pmap_pageable(pmap, entry->vme_start, end_addr, FALSE);
2665
2666         /*
2667          *      We simulate a fault to get the page and enter it
2668          *      in the physical map.
2669          */
2670
2671         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
2672                 if ((rc = vm_fault_wire_fast(
2673                                 map, va, entry, pmap)) != KERN_SUCCESS) {
2674                         rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
2675                                       (pmap == kernel_pmap) ? THREAD_UNINT : THREAD_ABORTSAFE);
2676                 }
2677
2678                 if (rc != KERN_SUCCESS) {
2679                         struct vm_map_entry     tmp_entry = *entry;
2680
2681                         /* unwire wired pages */
2682                         tmp_entry.vme_end = va;
2683                         vm_fault_unwire(map, &tmp_entry, FALSE, pmap);
2684
2685                         return rc;
2686                 }
2687         }
2688         return KERN_SUCCESS;
2689 }
2690
2691 /*
2692  *      vm_fault_unwire:
2693  *
2694  *      Unwire a range of virtual addresses in a map.
2695  */
2696 void
2697 vm_fault_unwire(
2698         vm_map_t        map,
2699         vm_map_entry_t  entry,
2700         boolean_t       deallocate,
2701         pmap_t          pmap)
2702 {
2703         register vm_offset_t    va;
2704         register vm_offset_t    end_addr = entry->vme_end;
2705         vm_object_t             object;
2706
2707         object = (entry->is_sub_map)
2708                         ? VM_OBJECT_NULL : entry->object.vm_object;
2709
2710         /*
2711          *      Since the pages are wired down, we must be able to
2712          *      get their mappings from the physical map system.
2713          */
2714
2715         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
2716                 pmap_change_wiring(pmap, va, FALSE);
2717
2718                 if (object == VM_OBJECT_NULL) {
2719                         (void) vm_fault(map, va, VM_PROT_NONE, TRUE, THREAD_UNINT);
2720                 } else {
2721                         vm_prot_t       prot;
2722                         vm_page_t       result_page;
2723                         vm_page_t       top_page;
2724                         vm_object_t     result_object;
2725                         vm_fault_return_t result;
2726
2727                         do {
2728                                 prot = VM_PROT_NONE;
2729
2730                                 vm_object_lock(object);
2731                                 vm_object_paging_begin(object);
2732                                 XPR(XPR_VM_FAULT,
2733                                         "vm_fault_unwire -> vm_fault_page\n",
2734                                         0,0,0,0,0);
2735                                 result = vm_fault_page(object,
2736                                                 entry->offset +
2737                                                   (va - entry->vme_start),
2738                                                 VM_PROT_NONE, TRUE,
2739                                                 THREAD_UNINT,
2740                                                 entry->offset,
2741                                                 entry->offset +
2742                                                        (entry->vme_end
2743                                                         - entry->vme_start),
2744                                                 entry->behavior,
2745                                                 &prot,
2746                                                 &result_page,
2747                                                 &top_page,
2748                                                 (int *)0,
2749                                                 0, map->no_zero_fill,
2750                                                 FALSE);
2751                         } while (result == VM_FAULT_RETRY);
2752
2753                         if (result != VM_FAULT_SUCCESS)
2754                                 panic("vm_fault_unwire: failure");
2755
2756                         result_object = result_page->object;
2757                         if (deallocate) {
2758                                 assert(!result_page->fictitious);
2759                                 pmap_page_protect(result_page->phys_addr,
2760                                                 VM_PROT_NONE);
2761                                 VM_PAGE_FREE(result_page);
2762                         } else {
2763                                 vm_page_lock_queues();
2764                                 vm_page_unwire(result_page);
2765                                 vm_page_unlock_queues();
2766                                 PAGE_WAKEUP_DONE(result_page);
2767                         }
2768
2769                         vm_fault_cleanup(result_object, top_page);
2770                 }
2771         }
2772
2773         /*
2774          *      Inform the physical mapping system that the range
2775          *      of addresses may fault, so that page tables and
2776          *      such may be unwired themselves.
2777          */
2778
2779         pmap_pageable(pmap, entry->vme_start, end_addr, TRUE);
2780
2781 }
2782
2783 /*
2784  *      vm_fault_wire_fast:
2785  *
2786  *      Handle common case of a wire down page fault at the given address.
2787  *      If successful, the page is inserted into the associated physical map.
2788  *      The map entry is passed in to avoid the overhead of a map lookup.
2789  *
2790  *      NOTE: the given address should be truncated to the
2791  *      proper page address.
2792  *
2793  *      KERN_SUCCESS is returned if the page fault is handled; otherwise,
2794  *      a standard error specifying why the fault is fatal is returned.
2795  *
2796  *      The map in question must be referenced, and remains so.
2797  *      Caller has a read lock on the map.
2798  *
2799  *      This is a stripped version of vm_fault() for wiring pages.  Anything
2800  *      other than the common case will return KERN_FAILURE, and the caller
2801  *      is expected to call vm_fault().
2802  */
2803 kern_return_t
2804 vm_fault_wire_fast(
2805         vm_map_t        map,
2806         vm_offset_t     va,
2807         vm_map_entry_t  entry,
2808         pmap_t          pmap)
2809 {
2810         vm_object_t             object;
2811         vm_object_offset_t      offset;
2812         register vm_page_t      m;
2813         vm_prot_t               prot;
2814         thread_act_t            thr_act;
2815
2816         VM_STAT(faults++);
2817
2818         if((thr_act=current_act()) && (thr_act->task != TASK_NULL))
2819           thr_act->task->faults++;
2820
2821 /*
2822  *      Recovery actions
2823  */
2824
2825 #undef  RELEASE_PAGE
2826 #define RELEASE_PAGE(m) {                               \
2827         PAGE_WAKEUP_DONE(m);                            \
2828         vm_page_lock_queues();                          \
2829         vm_page_unwire(m);                              \
2830         vm_page_unlock_queues();                        \
2831 }
2832
2833
2834 #undef  UNLOCK_THINGS
2835 #define UNLOCK_THINGS   {                               \
2836         object->paging_in_progress--;                   \
2837         vm_object_unlock(object);                       \
2838 }
2839
2840 #undef  UNLOCK_AND_DEALLOCATE
2841 #define UNLOCK_AND_DEALLOCATE   {                       \
2842         UNLOCK_THINGS;                                  \
2843         vm_object_deallocate(object);                   \
2844 }
2845 /*
2846  *      Give up and have caller do things the hard way.
2847  */
2848
2849 #define GIVE_UP {                                       \
2850         UNLOCK_AND_DEALLOCATE;                          \
2851         return(KERN_FAILURE);                           \
2852 }
2853
2854
2855         /*
2856          *      If this entry is not directly to a vm_object, bail out.
2857          */
2858         if (entry->is_sub_map)
2859                 return(KERN_FAILURE);
2860
2861         /*
2862          *      Find the backing store object and offset into it.
2863          */
2864
2865         object = entry->object.vm_object;
2866         offset = (va - entry->vme_start) + entry->offset;
2867         prot = entry->protection;
2868
2869         /*
2870          *      Make a reference to this object to prevent its
2871          *      disposal while we are messing with it.
2872          */
2873
2874         vm_object_lock(object);
2875         assert(object->ref_count > 0);
2876         object->ref_count++;
2877         vm_object_res_reference(object);
2878         object->paging_in_progress++;
2879
2880         /*
2881          *      INVARIANTS (through entire routine):
2882          *
2883          *      1)      At all times, we must either have the object
2884          *              lock or a busy page in some object to prevent
2885          *              some other thread from trying to bring in
2886          *              the same page.
2887          *
2888          *      2)      Once we have a busy page, we must remove it from
2889          *              the pageout queues, so that the pageout daemon
2890          *              will not grab it away.
2891          *
2892          */
2893
2894         /*
2895          *      Look for page in top-level object.  If it's not there or
2896          *      there's something going on, give up.
2897          */
2898         m = vm_page_lookup(object, offset);
2899         if ((m == VM_PAGE_NULL) || (m->busy) ||
2900             (m->unusual && ( m->error || m->restart || m->absent ||
2901                                 prot & m->page_lock))) {
2902
2903                 GIVE_UP;
2904         }
2905
2906         /*
2907          *      Wire the page down now.  All bail outs beyond this
2908          *      point must unwire the page.
2909          */
2910
2911         vm_page_lock_queues();
2912         vm_page_wire(m);
2913         vm_page_unlock_queues();
2914
2915         /*
2916          *      Mark page busy for other threads.
2917          */
2918         assert(!m->busy);
2919         m->busy = TRUE;
2920         assert(!m->absent);
2921
2922         /*
2923          *      Give up if the page is being written and there's a copy object
2924          */
2925         if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
2926                 RELEASE_PAGE(m);
2927                 GIVE_UP;
2928         }
2929
2930         /*
2931          *      Put this page into the physical map.
2932          *      We have to unlock the object because pmap_enter
2933          *      may cause other faults.
2934          */
2935         vm_object_unlock(object);
2936
2937         PMAP_ENTER(pmap, va, m, prot, TRUE);
2938         /* Sync I & D caches for new mapping */
2939         pmap_attribute(pmap,
2940                        va,
2941                        PAGE_SIZE,
2942                        MATTR_CACHE,
2943                        &mv_cache_sync);
2944
2945         /*
2946          *      Must relock object so that paging_in_progress can be cleared.
2947          */
2948         vm_object_lock(object);
2949
2950         /*
2951          *      Unlock everything, and return
2952          */
2953
2954         PAGE_WAKEUP_DONE(m);
2955         UNLOCK_AND_DEALLOCATE;
2956
2957         return(KERN_SUCCESS);
2958
2959 }
2960
2961 /*
2962  *      Routine:        vm_fault_copy_cleanup
2963  *      Purpose:
2964  *              Release a page used by vm_fault_copy.
2965  */
2966
2967 void
2968 vm_fault_copy_cleanup(
2969         vm_page_t       page,
2970         vm_page_t       top_page)
2971 {
2972         vm_object_t     object = page->object;
2973
2974         vm_object_lock(object);
2975         PAGE_WAKEUP_DONE(page);
2976         vm_page_lock_queues();
2977         if (!page->active && !page->inactive)
2978                 vm_page_activate(page);
2979         vm_page_unlock_queues();
2980         vm_fault_cleanup(object, top_page);
2981 }
2982
2983 void
2984 vm_fault_copy_dst_cleanup(
2985         vm_page_t       page)
2986 {
2987         vm_object_t     object;
2988
2989         if (page != VM_PAGE_NULL) {
2990                 object = page->object;
2991                 vm_object_lock(object);
2992                 vm_page_lock_queues();
2993                 vm_page_unwire(page);
2994                 vm_page_unlock_queues();
2995                 vm_object_paging_end(object);
2996                 vm_object_unlock(object);
2997         }
2998 }
2999
3000 /*
3001  *      Routine:        vm_fault_copy
3002  *
3003  *      Purpose:
3004  *              Copy pages from one virtual memory object to another --
3005  *              neither the source nor destination pages need be resident.
3006  *
3007  *              Before actually copying a page, the version associated with
3008  *              the destination address map wil be verified.
3009  *
3010  *      In/out conditions:
3011  *              The caller must hold a reference, but not a lock, to
3012  *              each of the source and destination objects and to the
3013  *              destination map.
3014  *
3015  *      Results:
3016  *              Returns KERN_SUCCESS if no errors were encountered in
3017  *              reading or writing the data.  Returns KERN_INTERRUPTED if
3018  *              the operation was interrupted (only possible if the
3019  *              "interruptible" argument is asserted).  Other return values
3020  *              indicate a permanent error in copying the data.
3021  *
3022  *              The actual amount of data copied will be returned in the
3023  *              "copy_size" argument.  In the event that the destination map
3024  *              verification failed, this amount may be less than the amount
3025  *              requested.
3026  */
3027 kern_return_t
3028 vm_fault_copy(
3029         vm_object_t             src_object,
3030         vm_object_offset_t      src_offset,
3031         vm_size_t               *src_size,              /* INOUT */
3032         vm_object_t             dst_object,
3033         vm_object_offset_t      dst_offset,
3034         vm_map_t                dst_map,
3035         vm_map_version_t         *dst_version,
3036         int                     interruptible)
3037 {
3038         vm_page_t               result_page;
3039
3040         vm_page_t               src_page;
3041         vm_page_t               src_top_page;
3042         vm_prot_t               src_prot;
3043
3044         vm_page_t               dst_page;
3045         vm_page_t               dst_top_page;
3046         vm_prot_t               dst_prot;
3047
3048         vm_size_t               amount_left;
3049         vm_object_t             old_copy_object;
3050         kern_return_t           error = 0;
3051
3052         vm_size_t               part_size;
3053
3054         /*
3055          * In order not to confuse the clustered pageins, align
3056          * the different offsets on a page boundary.
3057          */
3058         vm_object_offset_t      src_lo_offset = trunc_page_64(src_offset);
3059         vm_object_offset_t      dst_lo_offset = trunc_page_64(dst_offset);
3060         vm_object_offset_t      src_hi_offset = round_page_64(src_offset + *src_size);
3061         vm_object_offset_t      dst_hi_offset = round_page_64(dst_offset + *src_size);
3062
3063 #define RETURN(x)                                       \
3064         MACRO_BEGIN                                     \
3065         *src_size -= amount_left;                       \
3066         MACRO_RETURN(x);                                \
3067         MACRO_END
3068
3069         amount_left = *src_size;
3070         do { /* while (amount_left > 0) */
3071                 /*
3072                  * There may be a deadlock if both source and destination
3073                  * pages are the same. To avoid this deadlock, the copy must
3074                  * start by getting the destination page in order to apply
3075                  * COW semantics if any.
3076                  */
3077
3078         RetryDestinationFault: ;
3079
3080                 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
3081
3082                 vm_object_lock(dst_object);
3083                 vm_object_paging_begin(dst_object);
3084
3085                 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
3086                 switch (vm_fault_page(dst_object,
3087                                       trunc_page_64(dst_offset),
3088                                       VM_PROT_WRITE|VM_PROT_READ,
3089                                       FALSE,
3090                                       interruptible,
3091                                       dst_lo_offset,
3092                                       dst_hi_offset,
3093                                       VM_BEHAVIOR_SEQUENTIAL,
3094                                       &dst_prot,
3095                                       &dst_page,
3096                                       &dst_top_page,
3097                                       (int *)0,
3098                                       &error,
3099                                       dst_map->no_zero_fill,
3100                                       FALSE)) {
3101                 case VM_FAULT_SUCCESS:
3102                         break;
3103                 case VM_FAULT_RETRY:
3104                         goto RetryDestinationFault;
3105                 case VM_FAULT_MEMORY_SHORTAGE:
3106                         if (vm_page_wait(interruptible))
3107                                 goto RetryDestinationFault;
3108                         /* fall thru */
3109                 case VM_FAULT_INTERRUPTED:
3110                         RETURN(MACH_SEND_INTERRUPTED);
3111                 case VM_FAULT_FICTITIOUS_SHORTAGE:
3112                         vm_page_more_fictitious();
3113                         goto RetryDestinationFault;
3114                 case VM_FAULT_MEMORY_ERROR:
3115                         if (error)
3116                                 return (error);
3117                         else
3118                                 return(KERN_MEMORY_ERROR);
3119                 }
3120                 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
3121
3122                 old_copy_object = dst_page->object->copy;
3123
3124                 /*
3125                  * There exists the possiblity that the source and
3126                  * destination page are the same.  But we can't
3127                  * easily determine that now.  If they are the
3128                  * same, the call to vm_fault_page() for the
3129                  * destination page will deadlock.  To prevent this we
3130                  * wire the page so we can drop busy without having
3131                  * the page daemon steal the page.  We clean up the
3132                  * top page  but keep the paging reference on the object
3133                  * holding the dest page so it doesn't go away.
3134                  */
3135
3136                 vm_page_lock_queues();
3137                 vm_page_wire(dst_page);
3138                 vm_page_unlock_queues();
3139                 PAGE_WAKEUP_DONE(dst_page);
3140                 vm_object_unlock(dst_page->object);
3141
3142                 if (dst_top_page != VM_PAGE_NULL) {
3143                         vm_object_lock(dst_object);
3144                         VM_PAGE_FREE(dst_top_page);
3145                         vm_object_paging_end(dst_object);
3146                         vm_object_unlock(dst_object);
3147                 }
3148
3149         RetrySourceFault: ;
3150
3151                 if (src_object == VM_OBJECT_NULL) {
3152                         /*
3153                          *      No source object.  We will just
3154                          *      zero-fill the page in dst_object.
3155                          */
3156                         src_page = VM_PAGE_NULL;
3157                 } else {
3158                         vm_object_lock(src_object);
3159                         src_page = vm_page_lookup(src_object,
3160                                                   trunc_page_64(src_offset));
3161                         if (src_page == dst_page)
3162                                 src_prot = dst_prot;
3163                         else {
3164                                 src_prot = VM_PROT_READ;
3165                                 vm_object_paging_begin(src_object);
3166
3167                                 XPR(XPR_VM_FAULT,
3168                                         "vm_fault_copy(2) -> vm_fault_page\n",
3169                                         0,0,0,0,0);
3170                                 switch (vm_fault_page(src_object,
3171                                                       trunc_page_64(src_offset),
3172                                                       VM_PROT_READ,
3173                                                       FALSE,
3174                                                       interruptible,
3175                                                       src_lo_offset,
3176                                                       src_hi_offset,
3177                                                       VM_BEHAVIOR_SEQUENTIAL,
3178                                                       &src_prot,
3179                                                       &result_page,
3180                                                       &src_top_page,
3181                                                       (int *)0,
3182                                                       &error,
3183                                                       FALSE,
3184                                                       FALSE)) {
3185
3186                                 case VM_FAULT_SUCCESS:
3187                                         break;
3188                                 case VM_FAULT_RETRY:
3189                                         goto RetrySourceFault;
3190                                 case VM_FAULT_MEMORY_SHORTAGE:
3191                                         if (vm_page_wait(interruptible))
3192                                                 goto RetrySourceFault;
3193                                         /* fall thru */
3194                                 case VM_FAULT_INTERRUPTED:
3195                                         vm_fault_copy_dst_cleanup(dst_page);
3196                                         RETURN(MACH_SEND_INTERRUPTED);
3197                                 case VM_FAULT_FICTITIOUS_SHORTAGE:
3198                                         vm_page_more_fictitious();
3199                                         goto RetrySourceFault;
3200                                 case VM_FAULT_MEMORY_ERROR:
3201                                         vm_fault_copy_dst_cleanup(dst_page);
3202                                         if (error)
3203                                                 return (error);
3204                                         else
3205                                                 return(KERN_MEMORY_ERROR);
3206                                 }
3207
3208                                 src_page = result_page;
3209
3210                                 assert((src_top_page == VM_PAGE_NULL) ==
3211                                        (src_page->object == src_object));
3212                         }
3213                         assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
3214                         vm_object_unlock(src_page->object);
3215                 }
3216
3217                 if (!vm_map_verify(dst_map, dst_version)) {
3218                         if (src_page != VM_PAGE_NULL && src_page != dst_page)
3219                                 vm_fault_copy_cleanup(src_page, src_top_page);
3220                         vm_fault_copy_dst_cleanup(dst_page);
3221                         break;
3222                 }
3223
3224                 vm_object_lock(dst_page->object);
3225
3226                 if (dst_page->object->copy != old_copy_object) {
3227                         vm_object_unlock(dst_page->object);
3228                         vm_map_verify_done(dst_map, dst_version);
3229                         if (src_page != VM_PAGE_NULL && src_page != dst_page)
3230                                 vm_fault_copy_cleanup(src_page, src_top_page);
3231                         vm_fault_copy_dst_cleanup(dst_page);
3232                         break;
3233                 }
3234                 vm_object_unlock(dst_page->object);
3235
3236                 /*
3237                  *      Copy the page, and note that it is dirty
3238                  *      immediately.
3239                  */
3240
3241                 if (!page_aligned(src_offset) ||
3242                         !page_aligned(dst_offset) ||
3243                         !page_aligned(amount_left)) {
3244
3245                         vm_object_offset_t      src_po,
3246                                                 dst_po;
3247
3248                         src_po = src_offset - trunc_page_64(src_offset);
3249                         dst_po = dst_offset - trunc_page_64(dst_offset);
3250
3251                         if (dst_po > src_po) {
3252                                 part_size = PAGE_SIZE - dst_po;
3253                         } else {
3254                                 part_size = PAGE_SIZE - src_po;
3255                         }
3256                         if (part_size > (amount_left)){
3257                                 part_size = amount_left;
3258                         }
3259
3260                         if (src_page == VM_PAGE_NULL) {
3261                                 vm_page_part_zero_fill(dst_page,
3262                                                         dst_po, part_size);
3263                         } else {
3264                                 vm_page_part_copy(src_page, src_po,
3265                                         dst_page, dst_po, part_size);
3266                                 if(!dst_page->dirty){
3267                                         vm_object_lock(dst_object);
3268                                         dst_page->dirty = TRUE;
3269                                         vm_object_unlock(dst_page->object);
3270                                 }
3271
3272                         }
3273                 } else {
3274                         part_size = PAGE_SIZE;
3275
3276                         if (src_page == VM_PAGE_NULL)
3277                                 vm_page_zero_fill(dst_page);
3278                         else{
3279                                 vm_page_copy(src_page, dst_page);
3280                                 if(!dst_page->dirty){
3281                                         vm_object_lock(dst_object);
3282                                         dst_page->dirty = TRUE;
3283                                         vm_object_unlock(dst_page->object);
3284                                 }
3285                         }
3286
3287                 }
3288
3289                 /*
3290                  *      Unlock everything, and return
3291                  */
3292
3293                 vm_map_verify_done(dst_map, dst_version);
3294
3295                 if (src_page != VM_PAGE_NULL && src_page != dst_page)
3296                         vm_fault_copy_cleanup(src_page, src_top_page);
3297                 vm_fault_copy_dst_cleanup(dst_page);
3298
3299                 amount_left -= part_size;
3300                 src_offset += part_size;
3301                 dst_offset += part_size;
3302         } while (amount_left > 0);
3303
3304         RETURN(KERN_SUCCESS);
3305 #undef  RETURN
3306
3307         /*NOTREACHED*/
3308 }
3309
3310 #ifdef  notdef
3311
3312 /*
3313  *      Routine:        vm_fault_page_overwrite
3314  *
3315  *      Description:
3316  *              A form of vm_fault_page that assumes that the
3317  *              resulting page will be overwritten in its entirety,
3318  *              making it unnecessary to obtain the correct *contents*
3319  *              of the page.
3320  *
3321  *      Implementation:
3322  *              XXX Untested.  Also unused.  Eventually, this technology
3323  *              could be used in vm_fault_copy() to advantage.
3324  */
3325 vm_fault_return_t
3326 vm_fault_page_overwrite(
3327         register
3328         vm_object_t             dst_object,
3329         vm_object_offset_t      dst_offset,
3330         vm_page_t               *result_page)   /* OUT */
3331 {
3332         register
3333         vm_page_t       dst_page;
3334         kern_return_t   wait_result;
3335
3336 #define interruptible   THREAD_UNINT    /* XXX */
3337
3338         while (TRUE) {
3339                 /*
3340                  *      Look for a page at this offset
3341                  */
3342
3343                 while ((dst_page = vm_page_lookup(dst_object, dst_offset))
3344                                  == VM_PAGE_NULL) {
3345                         /*
3346                          *      No page, no problem... just allocate one.
3347                          */
3348
3349                         dst_page = vm_page_alloc(dst_object, dst_offset);
3350                         if (dst_page == VM_PAGE_NULL) {
3351                                 vm_object_unlock(dst_object);
3352                                 VM_PAGE_WAIT();
3353                                 vm_object_lock(dst_object);
3354                                 continue;
3355                         }
3356
3357                         /*
3358                          *      Pretend that the memory manager
3359                          *      write-protected the page.
3360                          *
3361                          *      Note that we will be asking for write
3362                          *      permission without asking for the data
3363                          *      first.
3364                          */
3365
3366                         dst_page->overwriting = TRUE;
3367                         dst_page->page_lock = VM_PROT_WRITE;
3368                         dst_page->absent = TRUE;
3369                         dst_page->unusual = TRUE;
3370                         dst_object->absent_count++;
3371
3372                         break;
3373
3374                         /*
3375                          *      When we bail out, we might have to throw
3376                          *      away the page created here.
3377                          */
3378
3379 #define DISCARD_PAGE                                            \
3380         MACRO_BEGIN                                             \
3381         vm_object_lock(dst_object);                             \
3382         dst_page = vm_page_lookup(dst_object, dst_offset);      \
3383         if ((dst_page != VM_PAGE_NULL) && dst_page->overwriting) \
3384                 VM_PAGE_FREE(dst_page);                         \
3385         vm_object_unlock(dst_object);                           \
3386         MACRO_END
3387                 }
3388
3389                 /*
3390                  *      If the page is write-protected...
3391                  */
3392
3393                 if (dst_page->page_lock & VM_PROT_WRITE) {
3394                         /*
3395                          *      ... and an unlock request hasn't been sent
3396                          */
3397
3398                         if ( ! (dst_page->unlock_request & VM_PROT_WRITE)) {
3399                                 vm_prot_t       u;
3400                                 kern_return_t   rc;
3401
3402                                 /*
3403                                  *      ... then send one now.
3404                                  */
3405
3406                                 if (!dst_object->pager_ready) {
3407                                         vm_object_assert_wait(dst_object,
3408                                                 VM_OBJECT_EVENT_PAGER_READY,
3409                                                 interruptible);
3410                                         vm_object_unlock(dst_object);
3411                                         wait_result = thread_block((void (*)(void))0);
3412                                         if (wait_result != THREAD_AWAKENED) {
3413                                                 DISCARD_PAGE;
3414                                                 return(VM_FAULT_INTERRUPTED);
3415                                         }
3416                                         continue;
3417                                 }
3418
3419                                 u = dst_page->unlock_request |= VM_PROT_WRITE;
3420                                 vm_object_unlock(dst_object);
3421
3422                                 if ((rc = memory_object_data_unlock(
3423                                                 dst_object->pager,
3424                                                 dst_object->pager_request,
3425                                                 dst_offset + dst_object->paging_offset,
3426                                                 PAGE_SIZE,
3427                                                 u)) != KERN_SUCCESS) {
3428                                         if (vm_fault_debug)
3429                                             printf("vm_object_overwrite: memory_object_data_unlock failed\n");
3430                                         DISCARD_PAGE;
3431                                         return((rc == MACH_SEND_INTERRUPTED) ?
3432                                                 VM_FAULT_INTERRUPTED :
3433                                                 VM_FAULT_MEMORY_ERROR);
3434                                 }
3435                                 vm_object_lock(dst_object);
3436                                 continue;
3437                         }
3438
3439                         /* ... fall through to wait below */
3440                 } else {
3441                         /*
3442                          *      If the page isn't being used for other
3443                          *      purposes, then we're done.
3444                          */
3445                         if ( ! (dst_page->busy || dst_page->absent ||
3446                                 dst_page->error || dst_page->restart) )
3447                                 break;
3448                 }
3449
3450                 PAGE_ASSERT_WAIT(dst_page, interruptible);
3451                 vm_object_unlock(dst_object);
3452                 wait_result = thread_block((void (*)(void))0);
3453                 if (wait_result != THREAD_AWAKENED) {
3454                         DISCARD_PAGE;
3455                         return(VM_FAULT_INTERRUPTED);
3456                 }
3457         }
3458
3459         *result_page = dst_page;
3460         return(VM_FAULT_SUCCESS);
3461
3462 #undef  interruptible
3463 #undef  DISCARD_PAGE
3464 }
3465
3466 #endif  /* notdef */
3467
3468 #if     VM_FAULT_CLASSIFY
3469 /*
3470  *      Temporary statistics gathering support.
3471  */
3472
3473 /*
3474  *      Statistics arrays:
3475  */
3476 #define VM_FAULT_TYPES_MAX      5
3477 #define VM_FAULT_LEVEL_MAX      8
3478
3479 int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
3480
3481 #define VM_FAULT_TYPE_ZERO_FILL 0
3482 #define VM_FAULT_TYPE_MAP_IN    1
3483 #define VM_FAULT_TYPE_PAGER     2
3484 #define VM_FAULT_TYPE_COPY      3
3485 #define VM_FAULT_TYPE_OTHER     4
3486
3487
3488 void
3489 vm_fault_classify(vm_object_t           object,
3490                   vm_object_offset_t    offset,
3491                   vm_prot_t             fault_type)
3492 {
3493         int             type, level = 0;
3494         vm_page_t       m;
3495
3496         while (TRUE) {
3497                 m = vm_page_lookup(object, offset);
3498                 if (m != VM_PAGE_NULL) {
3499                         if (m->busy || m->error || m->restart || m->absent ||
3500                             fault_type & m->page_lock) {
3501                                 type = VM_FAULT_TYPE_OTHER;
3502                                 break;
3503                         }
3504                         if (((fault_type & VM_PROT_WRITE) == 0) ||
3505                             ((level == 0) && object->copy == VM_OBJECT_NULL)) {
3506                                 type = VM_FAULT_TYPE_MAP_IN;
3507                                 break;
3508                         }
3509                         type = VM_FAULT_TYPE_COPY;
3510                         break;
3511                 }
3512                 else {
3513                         if (object->pager_created) {
3514                                 type = VM_FAULT_TYPE_PAGER;
3515                                 break;
3516                         }
3517                         if (object->shadow == VM_OBJECT_NULL) {
3518                                 type = VM_FAULT_TYPE_ZERO_FILL;
3519                                 break;
3520                         }
3521
3522                         offset += object->shadow_offset;
3523                         object = object->shadow;
3524                         level++;
3525                         continue;
3526                 }
3527         }
3528
3529         if (level > VM_FAULT_LEVEL_MAX)
3530                 level = VM_FAULT_LEVEL_MAX;
3531
3532         vm_fault_stats[type][level] += 1;
3533
3534         return;
3535 }
3536
3537 /* cleanup routine to call from debugger */
3538
3539 void
3540 vm_fault_classify_init(void)
3541 {
3542         int type, level;
3543
3544         for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
3545                 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
3546                         vm_fault_stats[type][level] = 0;
3547                 }
3548         }
3549
3550         return;
3551 }
3552 #endif  /* VM_FAULT_CLASSIFY */