osfmk/vm/vm_fault.c

   1 /*
   2  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /*
  23  * @OSF_COPYRIGHT@
  24  */
  25 /*
  26  * Mach Operating System
  27  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  28  * All Rights Reserved.
  29  *
  30  * Permission to use, copy, modify and distribute this software and its
  31  * documentation is hereby granted, provided that both the copyright
  32  * notice and this permission notice appear in all copies of the
  33  * software, derivative works or modified versions, and any portions
  34  * thereof, and that both notices appear in supporting documentation.
  35  *
  36  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  37  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  38  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  39  *
  40  * Carnegie Mellon requests users of this software to return to
  41  *
  42  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  43  *  School of Computer Science
  44  *  Carnegie Mellon University
  45  *  Pittsburgh PA 15213-3890
  46  *
  47  * any improvements or extensions that they make and grant Carnegie Mellon
  48  * the rights to redistribute these changes.
  49  */
  50 /*
  51  */
  52 /*
  53  *      File:   vm_fault.c
  54  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  55  *
  56  *      Page fault handling module.
  57  */
  58 #ifdef MACH_BSD
  59 /* remove after component interface available */
  60 extern int      vnode_pager_workaround;
  61 extern int      device_pager_workaround;
  62 #endif
  63
  64 #include <mach_cluster_stats.h>
  65 #include <mach_pagemap.h>
  66 #include <mach_kdb.h>
  67
  68 #include <vm/vm_fault.h>
  69 #include <mach/kern_return.h>
  70 #include <mach/message.h>       /* for error codes */
  71 #include <kern/host_statistics.h>
  72 #include <kern/counters.h>
  73 #include <kern/task.h>
  74 #include <kern/thread.h>
  75 #include <kern/sched_prim.h>
  76 #include <kern/host.h>
  77 #include <kern/xpr.h>
  78 #include <ppc/proc_reg.h>
  79 #include <ppc/pmap_internals.h>
  80 #include <vm/task_working_set.h>
  81 #include <vm/vm_map.h>
  82 #include <vm/vm_object.h>
  83 #include <vm/vm_page.h>
  84 #include <vm/pmap.h>
  85 #include <vm/vm_pageout.h>
  86 #include <mach/vm_param.h>
  87 #include <mach/vm_behavior.h>
  88 #include <mach/memory_object.h>
  89                                 /* For memory_object_data_{request,unlock} */
  90 #include <kern/mach_param.h>
  91 #include <kern/macro_help.h>
  92 #include <kern/zalloc.h>
  93 #include <kern/misc_protos.h>
  94
  95 #include <sys/kdebug.h>
  96
  97 #define VM_FAULT_CLASSIFY       0
  98 #define VM_FAULT_STATIC_CONFIG  1
  99
 100 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
 101
 102 int             vm_object_absent_max = 50;
 103
 104 int             vm_fault_debug = 0;
 105 boolean_t       vm_page_deactivate_behind = TRUE;
 106
 107 vm_machine_attribute_val_t mv_cache_sync = MATTR_VAL_CACHE_SYNC;
 108
 109 #if     !VM_FAULT_STATIC_CONFIG
 110 boolean_t       vm_fault_dirty_handling = FALSE;
 111 boolean_t       vm_fault_interruptible = FALSE;
 112 boolean_t       software_reference_bits = TRUE;
 113 #endif
 114
 115 #if     MACH_KDB
 116 extern struct db_watchpoint *db_watchpoint_list;
 117 #endif  /* MACH_KDB */
 118
 119 /* Forward declarations of internal routines. */
 120 extern kern_return_t vm_fault_wire_fast(
 121                                 vm_map_t        map,
 122                                 vm_offset_t     va,
 123                                 vm_map_entry_t  entry,
 124                                 pmap_t          pmap);
 125
 126 extern void vm_fault_continue(void);
 127
 128 extern void vm_fault_copy_cleanup(
 129                                 vm_page_t       page,
 130                                 vm_page_t       top_page);
 131
 132 extern void vm_fault_copy_dst_cleanup(
 133                                 vm_page_t       page);
 134
 135 #if     VM_FAULT_CLASSIFY
 136 extern void vm_fault_classify(vm_object_t       object,
 137                           vm_object_offset_t    offset,
 138                           vm_prot_t             fault_type);
 139
 140 extern void vm_fault_classify_init(void);
 141 #endif
 142
 143 /*
 144  *      Routine:        vm_fault_init
 145  *      Purpose:
 146  *              Initialize our private data structures.
 147  */
 148 void
 149 vm_fault_init(void)
 150 {
 151 }
 152
 153 /*
 154  *      Routine:        vm_fault_cleanup
 155  *      Purpose:
 156  *              Clean up the result of vm_fault_page.
 157  *      Results:
 158  *              The paging reference for "object" is released.
 159  *              "object" is unlocked.
 160  *              If "top_page" is not null,  "top_page" is
 161  *              freed and the paging reference for the object
 162  *              containing it is released.
 163  *
 164  *      In/out conditions:
 165  *              "object" must be locked.
 166  */
 167 void
 168 vm_fault_cleanup(
 169         register vm_object_t    object,
 170         register vm_page_t      top_page)
 171 {
 172         vm_object_paging_end(object);
 173         vm_object_unlock(object);
 174
 175         if (top_page != VM_PAGE_NULL) {
 176             object = top_page->object;
 177             vm_object_lock(object);
 178             VM_PAGE_FREE(top_page);
 179             vm_object_paging_end(object);
 180             vm_object_unlock(object);
 181         }
 182 }
 183
 184 #if     MACH_CLUSTER_STATS
 185 #define MAXCLUSTERPAGES 16
 186 struct {
 187         unsigned long pages_in_cluster;
 188         unsigned long pages_at_higher_offsets;
 189         unsigned long pages_at_lower_offsets;
 190 } cluster_stats_in[MAXCLUSTERPAGES];
 191 #define CLUSTER_STAT(clause)    clause
 192 #define CLUSTER_STAT_HIGHER(x)  \
 193         ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
 194 #define CLUSTER_STAT_LOWER(x)   \
 195          ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
 196 #define CLUSTER_STAT_CLUSTER(x) \
 197         ((cluster_stats_in[(x)].pages_in_cluster)++)
 198 #else   /* MACH_CLUSTER_STATS */
 199 #define CLUSTER_STAT(clause)
 200 #endif  /* MACH_CLUSTER_STATS */
 201
 202 /* XXX - temporary */
 203 boolean_t vm_allow_clustered_pagein = FALSE;
 204 int vm_pagein_cluster_used = 0;
 205
 206 /*
 207  * Prepage default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 208  */
 209 int vm_default_ahead = 1;       /* Number of pages to prepage ahead */
 210 int vm_default_behind = 0;      /* Number of pages to prepage behind */
 211
 212 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
 213
 214 /*
 215  *      Routine:        vm_fault_page
 216  *      Purpose:
 217  *              Find the resident page for the virtual memory
 218  *              specified by the given virtual memory object
 219  *              and offset.
 220  *      Additional arguments:
 221  *              The required permissions for the page is given
 222  *              in "fault_type".  Desired permissions are included
 223  *              in "protection".  The minimum and maximum valid offsets
 224  *              within the object for the relevant map entry are
 225  *              passed in "lo_offset" and "hi_offset" respectively and
 226  *              the expected page reference pattern is passed in "behavior".
 227  *              These three parameters are used to determine pagein cluster
 228  *              limits.
 229  *
 230  *              If the desired page is known to be resident (for
 231  *              example, because it was previously wired down), asserting
 232  *              the "unwiring" parameter will speed the search.
 233  *
 234  *              If the operation can be interrupted (by thread_abort
 235  *              or thread_terminate), then the "interruptible"
 236  *              parameter should be asserted.
 237  *
 238  *      Results:
 239  *              The page containing the proper data is returned
 240  *              in "result_page".
 241  *
 242  *      In/out conditions:
 243  *              The source object must be locked and referenced,
 244  *              and must donate one paging reference.  The reference
 245  *              is not affected.  The paging reference and lock are
 246  *              consumed.
 247  *
 248  *              If the call succeeds, the object in which "result_page"
 249  *              resides is left locked and holding a paging reference.
 250  *              If this is not the original object, a busy page in the
 251  *              original object is returned in "top_page", to prevent other
 252  *              callers from pursuing this same data, along with a paging
 253  *              reference for the original object.  The "top_page" should
 254  *              be destroyed when this guarantee is no longer required.
 255  *              The "result_page" is also left busy.  It is not removed
 256  *              from the pageout queues.
 257  */
 258
 259 vm_fault_return_t
 260 vm_fault_page(
 261         /* Arguments: */
 262         vm_object_t     first_object,   /* Object to begin search */
 263         vm_object_offset_t first_offset,        /* Offset into object */
 264         vm_prot_t       fault_type,     /* What access is requested */
 265         boolean_t       must_be_resident,/* Must page be resident? */
 266         int             interruptible,  /* how may fault be interrupted? */
 267         vm_object_offset_t lo_offset,   /* Map entry start */
 268         vm_object_offset_t hi_offset,   /* Map entry end */
 269         vm_behavior_t   behavior,       /* Page reference behavior */
 270         /* Modifies in place: */
 271         vm_prot_t       *protection,    /* Protection for mapping */
 272         /* Returns: */
 273         vm_page_t       *result_page,   /* Page found, if successful */
 274         vm_page_t       *top_page,      /* Page in top object, if
 275                                          * not result_page.  */
 276         int             *type_of_fault, /* if non-null, fill in with type of fault
 277                                          * COW, zero-fill, etc... returned in trace point */
 278         /* More arguments: */
 279         kern_return_t   *error_code,    /* code if page is in error */
 280         boolean_t       no_zero_fill,   /* don't zero fill absent pages */
 281         boolean_t       data_supply,    /* treat as data_supply if
 282                                          * it is a write fault and a full
 283                                          * page is provided */
 284         vm_map_t        map,
 285         vm_offset_t     vaddr)
 286 {
 287         register
 288         vm_page_t               m;
 289         register
 290         vm_object_t             object;
 291         register
 292         vm_object_offset_t      offset;
 293         vm_page_t               first_m;
 294         vm_object_t             next_object;
 295         vm_object_t             copy_object;
 296         boolean_t               look_for_page;
 297         vm_prot_t               access_required = fault_type;
 298         vm_prot_t               wants_copy_flag;
 299         vm_size_t               cluster_size, length;
 300         vm_object_offset_t      cluster_offset;
 301         vm_object_offset_t      cluster_start, cluster_end, paging_offset;
 302         vm_object_offset_t      align_offset;
 303         CLUSTER_STAT(int pages_at_higher_offsets;)
 304         CLUSTER_STAT(int pages_at_lower_offsets;)
 305         kern_return_t   wait_result;
 306         thread_t                cur_thread;
 307         boolean_t               interruptible_state;
 308         boolean_t               bumped_pagein = FALSE;
 309
 310
 311 #if     MACH_PAGEMAP
 312 /*
 313  * MACH page map - an optional optimization where a bit map is maintained
 314  * by the VM subsystem for internal objects to indicate which pages of
 315  * the object currently reside on backing store.  This existence map
 316  * duplicates information maintained by the vnode pager.  It is
 317  * created at the time of the first pageout against the object, i.e.
 318  * at the same time pager for the object is created.  The optimization
 319  * is designed to eliminate pager interaction overhead, if it is
 320  * 'known' that the page does not exist on backing store.
 321  *
 322  * LOOK_FOR() evaluates to TRUE if the page specified by object/offset is
 323  * either marked as paged out in the existence map for the object or no
 324  * existence map exists for the object.  LOOK_FOR() is one of the
 325  * criteria in the decision to invoke the pager.   It is also used as one
 326  * of the criteria to terminate the scan for adjacent pages in a clustered
 327  * pagein operation.  Note that LOOK_FOR() always evaluates to TRUE for
 328  * permanent objects.  Note also that if the pager for an internal object
 329  * has not been created, the pager is not invoked regardless of the value
 330  * of LOOK_FOR() and that clustered pagein scans are only done on an object
 331  * for which a pager has been created.
 332  *
 333  * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 334  * is marked as paged out in the existence map for the object.  PAGED_OUT()
 335  * PAGED_OUT() is used to determine if a page has already been pushed
 336  * into a copy object in order to avoid a redundant page out operation.
 337  */
 338 #define LOOK_FOR(o, f) (vm_external_state_get((o)->existence_map, (f)) \
 339                         != VM_EXTERNAL_STATE_ABSENT)
 340 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
 341                         == VM_EXTERNAL_STATE_EXISTS)
 342 #else /* MACH_PAGEMAP */
 343 /*
 344  * If the MACH page map optimization is not enabled,
 345  * LOOK_FOR() always evaluates to TRUE.  The pager will always be
 346  * invoked to resolve missing pages in an object, assuming the pager
 347  * has been created for the object.  In a clustered page operation, the
 348  * absence of a page on backing backing store cannot be used to terminate
 349  * a scan for adjacent pages since that information is available only in
 350  * the pager.  Hence pages that may not be paged out are potentially
 351  * included in a clustered request.  The vnode pager is coded to deal
 352  * with any combination of absent/present pages in a clustered
 353  * pagein request.  PAGED_OUT() always evaluates to FALSE, i.e. the pager
 354  * will always be invoked to push a dirty page into a copy object assuming
 355  * a pager has been created.  If the page has already been pushed, the
 356  * pager will ingore the new request.
 357  */
 358 #define LOOK_FOR(o, f) TRUE
 359 #define PAGED_OUT(o, f) FALSE
 360 #endif /* MACH_PAGEMAP */
 361
 362 /*
 363  *      Recovery actions
 364  */
 365 #define PREPARE_RELEASE_PAGE(m)                         \
 366         MACRO_BEGIN                                     \
 367         vm_page_lock_queues();                          \
 368         MACRO_END
 369
 370 #define DO_RELEASE_PAGE(m)                              \
 371         MACRO_BEGIN                                     \
 372         PAGE_WAKEUP_DONE(m);                            \
 373         if (!m->active && !m->inactive)                 \
 374                 vm_page_activate(m);                    \
 375         vm_page_unlock_queues();                        \
 376         MACRO_END
 377
 378 #define RELEASE_PAGE(m)                                 \
 379         MACRO_BEGIN                                     \
 380         PREPARE_RELEASE_PAGE(m);                        \
 381         DO_RELEASE_PAGE(m);                             \
 382         MACRO_END
 383
 384 #if TRACEFAULTPAGE
 385         dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
 386 #endif
 387
 388
 389
 390 #if     !VM_FAULT_STATIC_CONFIG
 391         if (vm_fault_dirty_handling
 392 #if     MACH_KDB
 393                 /*
 394                  *      If there are watchpoints set, then
 395                  *      we don't want to give away write permission
 396                  *      on a read fault.  Make the task write fault,
 397                  *      so that the watchpoint code notices the access.
 398                  */
 399             || db_watchpoint_list
 400 #endif  /* MACH_KDB */
 401             ) {
 402                 /*
 403                  *      If we aren't asking for write permission,
 404                  *      then don't give it away.  We're using write
 405                  *      faults to set the dirty bit.
 406                  */
 407                 if (!(fault_type & VM_PROT_WRITE))
 408                         *protection &= ~VM_PROT_WRITE;
 409         }
 410
 411         if (!vm_fault_interruptible)
 412                 interruptible = THREAD_UNINT;
 413 #else   /* STATIC_CONFIG */
 414 #if     MACH_KDB
 415                 /*
 416                  *      If there are watchpoints set, then
 417                  *      we don't want to give away write permission
 418                  *      on a read fault.  Make the task write fault,
 419                  *      so that the watchpoint code notices the access.
 420                  */
 421             if (db_watchpoint_list) {
 422                 /*
 423                  *      If we aren't asking for write permission,
 424                  *      then don't give it away.  We're using write
 425                  *      faults to set the dirty bit.
 426                  */
 427                 if (!(fault_type & VM_PROT_WRITE))
 428                         *protection &= ~VM_PROT_WRITE;
 429         }
 430
 431 #endif  /* MACH_KDB */
 432 #endif  /* STATIC_CONFIG */
 433
 434         cur_thread = current_thread();
 435
 436         interruptible_state = cur_thread->interruptible;
 437         if (interruptible == THREAD_UNINT)
 438                 cur_thread->interruptible = FALSE;
 439
 440         /*
 441          *      INVARIANTS (through entire routine):
 442          *
 443          *      1)      At all times, we must either have the object
 444          *              lock or a busy page in some object to prevent
 445          *              some other thread from trying to bring in
 446          *              the same page.
 447          *
 448          *              Note that we cannot hold any locks during the
 449          *              pager access or when waiting for memory, so
 450          *              we use a busy page then.
 451          *
 452          *              Note also that we aren't as concerned about more than
 453          *              one thread attempting to memory_object_data_unlock
 454          *              the same page at once, so we don't hold the page
 455          *              as busy then, but do record the highest unlock
 456          *              value so far.  [Unlock requests may also be delivered
 457          *              out of order.]
 458          *
 459          *      2)      To prevent another thread from racing us down the
 460          *              shadow chain and entering a new page in the top
 461          *              object before we do, we must keep a busy page in
 462          *              the top object while following the shadow chain.
 463          *
 464          *      3)      We must increment paging_in_progress on any object
 465          *              for which we have a busy page
 466          *
 467          *      4)      We leave busy pages on the pageout queues.
 468          *              If the pageout daemon comes across a busy page,
 469          *              it will remove the page from the pageout queues.
 470          */
 471
 472         /*
 473          *      Search for the page at object/offset.
 474          */
 475
 476         object = first_object;
 477         offset = first_offset;
 478         first_m = VM_PAGE_NULL;
 479         access_required = fault_type;
 480
 481         XPR(XPR_VM_FAULT,
 482                 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
 483                 (integer_t)object, offset, fault_type, *protection, 0);
 484
 485         /*
 486          *      See whether this page is resident
 487          */
 488
 489         while (TRUE) {
 490 #if TRACEFAULTPAGE
 491                 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
 492 #endif
 493                 if (!object->alive) {
 494                         vm_fault_cleanup(object, first_m);
 495                         cur_thread->interruptible = interruptible_state;
 496                         return(VM_FAULT_MEMORY_ERROR);
 497                 }
 498                 m = vm_page_lookup(object, offset);
 499 #if TRACEFAULTPAGE
 500                 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
 501 #endif
 502                 if (m != VM_PAGE_NULL) {
 503                         /*
 504                          *      If the page was pre-paged as part of a
 505                          *      cluster, record the fact.
 506                          */
 507                         if (m->clustered) {
 508                                 vm_pagein_cluster_used++;
 509                                 m->clustered = FALSE;
 510                         }
 511
 512                         /*
 513                          *      If the page is being brought in,
 514                          *      wait for it and then retry.
 515                          *
 516                          *      A possible optimization: if the page
 517                          *      is known to be resident, we can ignore
 518                          *      pages that are absent (regardless of
 519                          *      whether they're busy).
 520                          */
 521
 522                         if (m->busy) {
 523 #if TRACEFAULTPAGE
 524                                 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
 525 #endif
 526                                 PAGE_ASSERT_WAIT(m, interruptible);
 527                                 vm_object_unlock(object);
 528                                 XPR(XPR_VM_FAULT,
 529                                     "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
 530                                         (integer_t)object, offset,
 531                                         (integer_t)m, 0, 0);
 532                                 counter(c_vm_fault_page_block_busy_kernel++);
 533                                 wait_result = thread_block((void (*)(void))0);
 534
 535                                 vm_object_lock(object);
 536                                 if (wait_result != THREAD_AWAKENED) {
 537                                         vm_fault_cleanup(object, first_m);
 538                                         cur_thread->interruptible = interruptible_state;
 539                                         if (wait_result == THREAD_RESTART)
 540                                           {
 541                                                 return(VM_FAULT_RETRY);
 542                                           }
 543                                         else
 544                                           {
 545                                                 return(VM_FAULT_INTERRUPTED);
 546                                           }
 547                                 }
 548                                 continue;
 549                         }
 550
 551                         /*
 552                          *      If the page is in error, give up now.
 553                          */
 554
 555                         if (m->error) {
 556 #if TRACEFAULTPAGE
 557                                 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);      /* (TEST/DEBUG) */
 558 #endif
 559                                 if (error_code)
 560                                         *error_code = m->page_error;
 561                                 VM_PAGE_FREE(m);
 562                                 vm_fault_cleanup(object, first_m);
 563                                 cur_thread->interruptible = interruptible_state;
 564                                 return(VM_FAULT_MEMORY_ERROR);
 565                         }
 566
 567                         /*
 568                          *      If the pager wants us to restart
 569                          *      at the top of the chain,
 570                          *      typically because it has moved the
 571                          *      page to another pager, then do so.
 572                          */
 573
 574                         if (m->restart) {
 575 #if TRACEFAULTPAGE
 576                                 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
 577 #endif
 578                                 VM_PAGE_FREE(m);
 579                                 vm_fault_cleanup(object, first_m);
 580                                 cur_thread->interruptible = interruptible_state;
 581                                 return(VM_FAULT_RETRY);
 582                         }
 583
 584                         /*
 585                          *      If the page isn't busy, but is absent,
 586                          *      then it was deemed "unavailable".
 587                          */
 588
 589                         if (m->absent) {
 590                                 /*
 591                                  * Remove the non-existent page (unless it's
 592                                  * in the top object) and move on down to the
 593                                  * next object (if there is one).
 594                                  */
 595 #if TRACEFAULTPAGE
 596                                 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);  /* (TEST/DEBUG) */
 597 #endif
 598
 599                                 next_object = object->shadow;
 600                                 if (next_object == VM_OBJECT_NULL) {
 601                                         vm_page_t real_m;
 602
 603                                         assert(!must_be_resident);
 604
 605                                         if (object->shadow_severed) {
 606                                                 vm_fault_cleanup(
 607                                                         object, first_m);
 608                                                 cur_thread->interruptible = interruptible_state;
 609                                                 return VM_FAULT_MEMORY_ERROR;
 610                                         }
 611
 612                                         /*
 613                                          * Absent page at bottom of shadow
 614                                          * chain; zero fill the page we left
 615                                          * busy in the first object, and flush
 616                                          * the absent page.  But first we
 617                                          * need to allocate a real page.
 618                                          */
 619                                         if (VM_PAGE_THROTTLED() ||
 620                                             (real_m = vm_page_grab()) == VM_PAGE_NULL) {
 621                                                 vm_fault_cleanup(object, first_m);
 622                                                 cur_thread->interruptible = interruptible_state;
 623                                                 return(VM_FAULT_MEMORY_SHORTAGE);
 624                                         }
 625
 626                                         XPR(XPR_VM_FAULT,
 627               "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
 628                                                 (integer_t)object, offset,
 629                                                 (integer_t)m,
 630                                                 (integer_t)first_object, 0);
 631                                         if (object != first_object) {
 632                                                 VM_PAGE_FREE(m);
 633                                                 vm_object_paging_end(object);
 634                                                 vm_object_unlock(object);
 635                                                 object = first_object;
 636                                                 offset = first_offset;
 637                                                 m = first_m;
 638                                                 first_m = VM_PAGE_NULL;
 639                                                 vm_object_lock(object);
 640                                         }
 641
 642                                         VM_PAGE_FREE(m);
 643                                         assert(real_m->busy);
 644                                         vm_page_insert(real_m, object, offset);
 645                                         m = real_m;
 646
 647                                         /*
 648                                          *  Drop the lock while zero filling
 649                                          *  page.  Then break because this
 650                                          *  is the page we wanted.  Checking
 651                                          *  the page lock is a waste of time;
 652                                          *  this page was either absent or
 653                                          *  newly allocated -- in both cases
 654                                          *  it can't be page locked by a pager.
 655                                          */
 656                                         m->no_isync = FALSE;
 657
 658                                         if (!no_zero_fill) {
 659                                                 vm_object_unlock(object);
 660                                                 vm_page_zero_fill(m);
 661                                                 if (type_of_fault)
 662                                                         *type_of_fault = DBG_ZERO_FILL_FAULT;
 663                                                 VM_STAT(zero_fill_count++);
 664
 665                                                 if (bumped_pagein == TRUE) {
 666                                                         VM_STAT(pageins--);
 667                                                         current_task()->pageins--;
 668                                                 }
 669                                                 vm_object_lock(object);
 670                                         }
 671                                         pmap_clear_modify(m->phys_addr);
 672                                         vm_page_lock_queues();
 673                                         VM_PAGE_QUEUES_REMOVE(m);
 674                                         m->page_ticket = vm_page_ticket;
 675                                         vm_page_ticket_roll++;
 676                                         if(vm_page_ticket_roll ==
 677                                                 VM_PAGE_TICKETS_IN_ROLL) {
 678                                                 vm_page_ticket_roll = 0;
 679                                                 if(vm_page_ticket ==
 680                                                      VM_PAGE_TICKET_ROLL_IDS)
 681                                                         vm_page_ticket= 0;
 682                                                 else
 683                                                         vm_page_ticket++;
 684                                         }
 685                                         queue_enter(&vm_page_queue_inactive,
 686                                                         m, vm_page_t, pageq);
 687                                         m->inactive = TRUE;
 688                                         vm_page_inactive_count++;
 689                                         vm_page_unlock_queues();
 690                                         break;
 691                                 } else {
 692                                         if (must_be_resident) {
 693                                                 vm_object_paging_end(object);
 694                                         } else if (object != first_object) {
 695                                                 vm_object_paging_end(object);
 696                                                 VM_PAGE_FREE(m);
 697                                         } else {
 698                                                 first_m = m;
 699                                                 m->absent = FALSE;
 700                                                 m->unusual = FALSE;
 701                                                 vm_object_absent_release(object);
 702                                                 m->busy = TRUE;
 703
 704                                                 vm_page_lock_queues();
 705                                                 VM_PAGE_QUEUES_REMOVE(m);
 706                                                 vm_page_unlock_queues();
 707                                         }
 708                                         XPR(XPR_VM_FAULT,
 709                                             "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
 710                                                 (integer_t)object, offset,
 711                                                 (integer_t)next_object,
 712                                                 offset+object->shadow_offset,0);
 713                                         offset += object->shadow_offset;
 714                                         hi_offset += object->shadow_offset;
 715                                         lo_offset += object->shadow_offset;
 716                                         access_required = VM_PROT_READ;
 717                                         vm_object_lock(next_object);
 718                                         vm_object_unlock(object);
 719                                         object = next_object;
 720                                         vm_object_paging_begin(object);
 721                                         continue;
 722                                 }
 723                         }
 724
 725                         if ((m->cleaning)
 726                                 && ((object != first_object) ||
 727                                     (object->copy != VM_OBJECT_NULL))
 728                                 && (fault_type & VM_PROT_WRITE)) {
 729                                 /*
 730                                  * This is a copy-on-write fault that will
 731                                  * cause us to revoke access to this page, but
 732                                  * this page is in the process of being cleaned
 733                                  * in a clustered pageout. We must wait until
 734                                  * the cleaning operation completes before
 735                                  * revoking access to the original page,
 736                                  * otherwise we might attempt to remove a
 737                                  * wired mapping.
 738                                  */
 739 #if TRACEFAULTPAGE
 740                                 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
 741 #endif
 742                                 XPR(XPR_VM_FAULT,
 743                                     "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
 744                                         (integer_t)object, offset,
 745                                         (integer_t)m, 0, 0);
 746                                 /* take an extra ref so that object won't die */
 747                                 assert(object->ref_count > 0);
 748                                 object->ref_count++;
 749                                 vm_object_res_reference(object);
 750                                 vm_fault_cleanup(object, first_m);
 751                                 counter(c_vm_fault_page_block_backoff_kernel++);
 752                                 vm_object_lock(object);
 753                                 assert(object->ref_count > 0);
 754                                 m = vm_page_lookup(object, offset);
 755                                 if (m != VM_PAGE_NULL && m->cleaning) {
 756                                         PAGE_ASSERT_WAIT(m, interruptible);
 757                                         vm_object_unlock(object);
 758                                         wait_result = thread_block((void (*)(void)) 0);
 759                                         vm_object_deallocate(object);
 760                                         goto backoff;
 761                                 } else {
 762                                         vm_object_unlock(object);
 763                                         vm_object_deallocate(object);
 764                                         cur_thread->interruptible = interruptible_state;
 765                                         return VM_FAULT_RETRY;
 766                                 }
 767                         }
 768
 769                         /*
 770                          *      If the desired access to this page has
 771                          *      been locked out, request that it be unlocked.
 772                          */
 773
 774                         if (access_required & m->page_lock) {
 775                                 if ((access_required & m->unlock_request) != access_required) {
 776                                         vm_prot_t       new_unlock_request;
 777                                         kern_return_t   rc;
 778
 779 #if TRACEFAULTPAGE
 780                                         dbgTrace(0xBEEF000A, (unsigned int) m, (unsigned int) object->pager_ready);     /* (TEST/DEBUG) */
 781 #endif
 782                                         if (!object->pager_ready) {
 783                                         XPR(XPR_VM_FAULT,
 784                                             "vm_f_page: ready wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
 785                                                 access_required,
 786                                                 (integer_t)object, offset,
 787                                                 (integer_t)m, 0);
 788                                                 /* take an extra ref */
 789                                                 assert(object->ref_count > 0);
 790                                                 object->ref_count++;
 791                                                 vm_object_res_reference(object);
 792                                                 vm_fault_cleanup(object,
 793                                                                  first_m);
 794                                                 counter(c_vm_fault_page_block_backoff_kernel++);
 795                                                 vm_object_lock(object);
 796                                                 assert(object->ref_count > 0);
 797                                                 if (!object->pager_ready) {
 798                                                         vm_object_assert_wait(
 799                                                                 object,
 800                                                                 VM_OBJECT_EVENT_PAGER_READY,
 801                                                                 interruptible);
 802                                                         vm_object_unlock(object);
 803                                                         wait_result = thread_block((void (*)(void))0);
 804                                                         vm_object_deallocate(object);
 805                                                         goto backoff;
 806                                                 } else {
 807                                                         vm_object_unlock(object);
 808                                                         vm_object_deallocate(object);
 809                                                         cur_thread->interruptible = interruptible_state;
 810                                                         return VM_FAULT_RETRY;
 811                                                 }
 812                                         }
 813
 814                                         new_unlock_request = m->unlock_request =
 815                                                 (access_required | m->unlock_request);
 816                                         vm_object_unlock(object);
 817                                         XPR(XPR_VM_FAULT,
 818                                             "vm_f_page: unlock obj 0x%X, offset 0x%X, page 0x%X, unl_req %d\n",
 819                                         (integer_t)object, offset,
 820                                         (integer_t)m, new_unlock_request, 0);
 821                                         if ((rc = memory_object_data_unlock(
 822                                                 object->pager,
 823                                                 offset + object->paging_offset,
 824                                                 PAGE_SIZE,
 825                                                 new_unlock_request))
 826                                              != KERN_SUCCESS) {
 827                                                 if (vm_fault_debug)
 828                                                     printf("vm_fault: memory_object_data_unlock failed\n");
 829                                                 vm_object_lock(object);
 830                                                 vm_fault_cleanup(object, first_m);
 831                                                 cur_thread->interruptible = interruptible_state;
 832                                                 return((rc == MACH_SEND_INTERRUPTED) ?
 833                                                         VM_FAULT_INTERRUPTED :
 834                                                         VM_FAULT_MEMORY_ERROR);
 835                                         }
 836                                         vm_object_lock(object);
 837                                         continue;
 838                                 }
 839
 840                                 XPR(XPR_VM_FAULT,
 841         "vm_f_page: access wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
 842                                         access_required, (integer_t)object,
 843                                         offset, (integer_t)m, 0);
 844                                 /* take an extra ref so object won't die */
 845                                 assert(object->ref_count > 0);
 846                                 object->ref_count++;
 847                                 vm_object_res_reference(object);
 848                                 vm_fault_cleanup(object, first_m);
 849                                 counter(c_vm_fault_page_block_backoff_kernel++);
 850                                 vm_object_lock(object);
 851                                 assert(object->ref_count > 0);
 852                                 m = vm_page_lookup(object, offset);
 853                                 if (m != VM_PAGE_NULL &&
 854                                     (access_required & m->page_lock) &&
 855                                     !((access_required & m->unlock_request) != access_required)) {
 856                                         PAGE_ASSERT_WAIT(m, interruptible);
 857                                         vm_object_unlock(object);
 858                                         wait_result = thread_block((void (*)(void)) 0);
 859                                         vm_object_deallocate(object);
 860                                         goto backoff;
 861                                 } else {
 862                                         vm_object_unlock(object);
 863                                         vm_object_deallocate(object);
 864                                         cur_thread->interruptible = interruptible_state;
 865                                         return VM_FAULT_RETRY;
 866                                 }
 867                         }
 868                         /*
 869                          *      We mark the page busy and leave it on
 870                          *      the pageout queues.  If the pageout
 871                          *      deamon comes across it, then it will
 872                          *      remove the page.
 873                          */
 874
 875 #if TRACEFAULTPAGE
 876                         dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
 877 #endif
 878
 879 #if     !VM_FAULT_STATIC_CONFIG
 880                         if (!software_reference_bits) {
 881                                 vm_page_lock_queues();
 882                                 if (m->inactive)
 883                                         vm_stat.reactivations++;
 884
 885                                 VM_PAGE_QUEUES_REMOVE(m);
 886                                 vm_page_unlock_queues();
 887                         }
 888 #endif
 889                         XPR(XPR_VM_FAULT,
 890                             "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
 891                                 (integer_t)object, offset, (integer_t)m, 0, 0);
 892                         assert(!m->busy);
 893                         m->busy = TRUE;
 894                         assert(!m->absent);
 895                         break;
 896                 }
 897
 898                 look_for_page =
 899                         (object->pager_created) &&
 900                           LOOK_FOR(object, offset) &&
 901                             (!data_supply);
 902
 903 #if TRACEFAULTPAGE
 904                 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);      /* (TEST/DEBUG) */
 905 #endif
 906                 if ((look_for_page || (object == first_object))
 907                                 && !must_be_resident
 908                                 && !(object->phys_contiguous))  {
 909                         /*
 910                          *      Allocate a new page for this object/offset
 911                          *      pair.
 912                          */
 913
 914                         m = vm_page_grab_fictitious();
 915 #if TRACEFAULTPAGE
 916                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
 917 #endif
 918                         if (m == VM_PAGE_NULL) {
 919                                 vm_fault_cleanup(object, first_m);
 920                                 cur_thread->interruptible = interruptible_state;
 921                                 return(VM_FAULT_FICTITIOUS_SHORTAGE);
 922                         }
 923                         vm_page_insert(m, object, offset);
 924                 }
 925
 926                 if ((look_for_page && !must_be_resident)) {
 927                         kern_return_t   rc;
 928
 929                         /*
 930                          *      If the memory manager is not ready, we
 931                          *      cannot make requests.
 932                          */
 933                         if (!object->pager_ready) {
 934 #if TRACEFAULTPAGE
 935                                 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
 936 #endif
 937                                 if(m != VM_PAGE_NULL)
 938                                         VM_PAGE_FREE(m);
 939                                 XPR(XPR_VM_FAULT,
 940                                 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
 941                                         (integer_t)object, offset, 0, 0, 0);
 942                                 /* take an extra ref so object won't die */
 943                                 assert(object->ref_count > 0);
 944                                 object->ref_count++;
 945                                 vm_object_res_reference(object);
 946                                 vm_fault_cleanup(object, first_m);
 947                                 counter(c_vm_fault_page_block_backoff_kernel++);
 948                                 vm_object_lock(object);
 949                                 assert(object->ref_count > 0);
 950                                 if (!object->pager_ready) {
 951                                         vm_object_assert_wait(object,
 952                                                               VM_OBJECT_EVENT_PAGER_READY,
 953                                                               interruptible);
 954                                         vm_object_unlock(object);
 955                                         wait_result = thread_block((void (*)(void))0);
 956                                         vm_object_deallocate(object);
 957                                         goto backoff;
 958                                 } else {
 959                                         vm_object_unlock(object);
 960                                         vm_object_deallocate(object);
 961                                         cur_thread->interruptible = interruptible_state;
 962                                         return VM_FAULT_RETRY;
 963                                 }
 964                         }
 965
 966                         if(object->phys_contiguous) {
 967                                 if(m != VM_PAGE_NULL) {
 968                                         VM_PAGE_FREE(m);
 969                                         m = VM_PAGE_NULL;
 970                                 }
 971                                 goto no_clustering;
 972                         }
 973                         if (object->internal) {
 974                                 /*
 975                                  *      Requests to the default pager
 976                                  *      must reserve a real page in advance,
 977                                  *      because the pager's data-provided
 978                                  *      won't block for pages.  IMPORTANT:
 979                                  *      this acts as a throttling mechanism
 980                                  *      for data_requests to the default
 981                                  *      pager.
 982                                  */
 983
 984 #if TRACEFAULTPAGE
 985                                 dbgTrace(0xBEEF000F, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
 986 #endif
 987                                 if (m->fictitious && !vm_page_convert(m)) {
 988                                         VM_PAGE_FREE(m);
 989                                         vm_fault_cleanup(object, first_m);
 990                                         cur_thread->interruptible = interruptible_state;
 991                                         return(VM_FAULT_MEMORY_SHORTAGE);
 992                                 }
 993                         } else if (object->absent_count >
 994                                                 vm_object_absent_max) {
 995                                 /*
 996                                  *      If there are too many outstanding page
 997                                  *      requests pending on this object, we
 998                                  *      wait for them to be resolved now.
 999                                  */
1000
1001 #if TRACEFAULTPAGE
1002                                 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1003 #endif
1004                                 if(m != VM_PAGE_NULL)
1005                                         VM_PAGE_FREE(m);
1006                                 /* take an extra ref so object won't die */
1007                                 assert(object->ref_count > 0);
1008                                 object->ref_count++;
1009                                 vm_object_res_reference(object);
1010                                 vm_fault_cleanup(object, first_m);
1011                                 counter(c_vm_fault_page_block_backoff_kernel++);
1012                                 vm_object_lock(object);
1013                                 assert(object->ref_count > 0);
1014                                 if (object->absent_count > vm_object_absent_max) {
1015                                         vm_object_absent_assert_wait(object,
1016                                                                      interruptible);
1017                                         vm_object_unlock(object);
1018                                         wait_result = thread_block((void (*)(void))0);
1019                                         vm_object_deallocate(object);
1020                                         goto backoff;
1021                                 } else {
1022                                         vm_object_unlock(object);
1023                                         vm_object_deallocate(object);
1024                                         cur_thread->interruptible = interruptible_state;
1025                                         return VM_FAULT_RETRY;
1026                                 }
1027                         }
1028
1029                         /*
1030                          *      Indicate that the page is waiting for data
1031                          *      from the memory manager.
1032                          */
1033
1034                         if(m != VM_PAGE_NULL) {
1035
1036                                 m->list_req_pending = TRUE;
1037                                 m->absent = TRUE;
1038                                 m->unusual = TRUE;
1039                                 object->absent_count++;
1040
1041                         }
1042
1043                         cluster_start = offset;
1044                         length = PAGE_SIZE;
1045                         cluster_size = object->cluster_size;
1046
1047                         /*
1048                          * Skip clustered pagein if it is globally disabled
1049                          * or random page reference behavior is expected
1050                          * for the address range containing the faulting
1051                          * address or the object paging block size is
1052                          * equal to the page size.
1053                          */
1054                         if (!vm_allow_clustered_pagein ||
1055                              behavior == VM_BEHAVIOR_RANDOM ||
1056                              m == VM_PAGE_NULL ||
1057                              cluster_size == PAGE_SIZE) {
1058                                 cluster_start = trunc_page_64(cluster_start);
1059                                 goto no_clustering;
1060                         }
1061
1062                         assert(offset >= lo_offset);
1063                         assert(offset < hi_offset);
1064                         assert(ALIGNED(object->paging_offset));
1065                         assert(cluster_size >= PAGE_SIZE);
1066
1067 #if TRACEFAULTPAGE
1068                         dbgTrace(0xBEEF0011, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1069 #endif
1070                         /*
1071                          * Decide whether to scan ahead or behind for
1072                          * additional pages contiguous to the faulted
1073                          * page in the same paging block.  The decision
1074                          * is based on system wide globals and the
1075                          * expected page reference behavior of the
1076                          * address range contained the faulting address.
1077                          * First calculate some constants.
1078                          */
1079                         paging_offset = offset + object->paging_offset;
1080                         cluster_offset = paging_offset & (cluster_size - 1);
1081                         align_offset = paging_offset&(PAGE_SIZE_64-1);
1082                         if (align_offset != 0) {
1083                                 cluster_offset = trunc_page_64(cluster_offset);
1084                         }
1085
1086 #define SPANS_CLUSTER(x) ((((x) - align_offset) & (vm_object_offset_t)(cluster_size - 1)) == 0)
1087
1088                         /*
1089                          * Backward scan only if reverse sequential
1090                          * behavior has been specified
1091                          */
1092                         CLUSTER_STAT(pages_at_lower_offsets = 0;)
1093                         if (((vm_default_behind != 0 &&
1094                              behavior == VM_BEHAVIOR_DEFAULT) ||
1095                              behavior == VM_BEHAVIOR_RSEQNTL) && offset) {
1096                             vm_object_offset_t cluster_bot;
1097
1098                             /*
1099                              * Calculate lower search boundary.
1100                              * Exclude pages that span a cluster boundary.
1101                              * Clip to start of map entry.
1102                              * For default page reference behavior, scan
1103                              * default pages behind.
1104                              */
1105                             cluster_bot = (offset > cluster_offset) ?
1106                                             offset - cluster_offset : offset;
1107                             if (align_offset != 0) {
1108                                 if ((cluster_bot < offset) &&
1109                                     SPANS_CLUSTER(cluster_bot)) {
1110                                         cluster_bot += PAGE_SIZE_64;
1111                                 }
1112                             }
1113                             if (behavior == VM_BEHAVIOR_DEFAULT) {
1114                                 vm_object_offset_t
1115                                         bot = (vm_object_offset_t)
1116                                                 (vm_default_behind * PAGE_SIZE);
1117
1118                                 if (cluster_bot < (offset - bot))
1119                                         cluster_bot = offset - bot;
1120                             }
1121                             if (lo_offset > cluster_bot)
1122                                 cluster_bot = lo_offset;
1123
1124                             for ( cluster_start = offset - PAGE_SIZE_64;
1125                                  (cluster_start >= cluster_bot) &&
1126                                  (cluster_start !=
1127                                         (align_offset - PAGE_SIZE_64));
1128                                   cluster_start -= PAGE_SIZE_64) {
1129                                 assert(cluster_size > PAGE_SIZE_64);
1130 retry_cluster_backw:
1131                                 if (!LOOK_FOR(object, cluster_start) ||
1132                                     vm_page_lookup(object, cluster_start)
1133                                                 != VM_PAGE_NULL) {
1134                                         break;
1135                                 }
1136                                 if (object->internal) {
1137                                         /*
1138                                          * need to acquire a real page in
1139                                          * advance because this acts as
1140                                          * a throttling mechanism for
1141                                          * data_requests to the default
1142                                          * pager.  If this fails, give up
1143                                          * trying to find any more pages
1144                                          * in the cluster and send off the
1145                                          * request for what we already have.
1146                                          */
1147                                         if ((m = vm_page_grab())
1148                                                         == VM_PAGE_NULL) {
1149                                             cluster_start += PAGE_SIZE_64;
1150                                             cluster_end = offset + PAGE_SIZE_64;
1151                                             goto give_up;
1152                                         }
1153                                 } else if ((m = vm_page_grab_fictitious())
1154                                                 == VM_PAGE_NULL) {
1155                                         vm_object_unlock(object);
1156                                         vm_page_more_fictitious();
1157                                         vm_object_lock(object);
1158                                         goto retry_cluster_backw;
1159                                 }
1160                                 m->absent = TRUE;
1161                                 m->unusual = TRUE;
1162                                 m->clustered = TRUE;
1163                                 m->list_req_pending = TRUE;
1164
1165                                 vm_page_insert(m, object, cluster_start);
1166                                 CLUSTER_STAT(pages_at_lower_offsets++;)
1167                                 object->absent_count++;
1168                             }
1169                             cluster_start += PAGE_SIZE_64;
1170                             assert(cluster_start >= cluster_bot);
1171                         }
1172                         assert(cluster_start <= offset);
1173
1174                         /*
1175                          * Forward scan if default or sequential behavior
1176                          * specified
1177                          */
1178                         CLUSTER_STAT(pages_at_higher_offsets = 0;)
1179                         if ((behavior == VM_BEHAVIOR_DEFAULT &&
1180                              vm_default_ahead != 0) ||
1181                              behavior == VM_BEHAVIOR_SEQUENTIAL) {
1182                             vm_object_offset_t cluster_top;
1183
1184                             /*
1185                              * Calculate upper search boundary.
1186                              * Exclude pages that span a cluster boundary.
1187                              * Clip to end of map entry.
1188                              * For default page reference behavior, scan
1189                              * default pages ahead.
1190                              */
1191                             cluster_top = (offset + cluster_size) -
1192                                           cluster_offset;
1193                             if (align_offset != 0) {
1194                                 if ((cluster_top > (offset + PAGE_SIZE_64)) &&
1195                                     SPANS_CLUSTER(cluster_top)) {
1196                                         cluster_top -= PAGE_SIZE_64;
1197                                 }
1198                             }
1199                             if (behavior == VM_BEHAVIOR_DEFAULT) {
1200                                 vm_object_offset_t top = (vm_object_offset_t)
1201                                      ((vm_default_ahead*PAGE_SIZE)+PAGE_SIZE);
1202
1203                                 if (cluster_top > (offset + top))
1204                                         cluster_top =  offset + top;
1205                             }
1206                             if (cluster_top > hi_offset)
1207                                         cluster_top = hi_offset;
1208
1209                             for (cluster_end = offset + PAGE_SIZE_64;
1210                                  cluster_end < cluster_top;
1211                                  cluster_end += PAGE_SIZE_64) {
1212                                 assert(cluster_size > PAGE_SIZE);
1213 retry_cluster_forw:
1214                                 if (!LOOK_FOR(object, cluster_end) ||
1215                                     vm_page_lookup(object, cluster_end)
1216                                                 != VM_PAGE_NULL) {
1217                                         break;
1218                                 }
1219                                 if (object->internal) {
1220                                         /*
1221                                          * need to acquire a real page in
1222                                          * advance because this acts as
1223                                          * a throttling mechanism for
1224                                          * data_requests to the default
1225                                          * pager.  If this fails, give up
1226                                          * trying to find any more pages
1227                                          * in the cluster and send off the
1228                                          * request for what we already have.
1229                                          */
1230                                         if ((m = vm_page_grab())
1231                                                         == VM_PAGE_NULL) {
1232                                             break;
1233                                         }
1234                                 } else if ((m = vm_page_grab_fictitious())
1235                                                 == VM_PAGE_NULL) {
1236                                     vm_object_unlock(object);
1237                                     vm_page_more_fictitious();
1238                                     vm_object_lock(object);
1239                                     goto retry_cluster_forw;
1240                                 }
1241                                 m->absent = TRUE;
1242                                 m->unusual = TRUE;
1243                                 m->clustered = TRUE;
1244                                 m->list_req_pending = TRUE;
1245
1246                                 vm_page_insert(m, object, cluster_end);
1247                                 CLUSTER_STAT(pages_at_higher_offsets++;)
1248                                 object->absent_count++;
1249                             }
1250                             assert(cluster_end <= cluster_top);
1251                         }
1252                         else {
1253                                 cluster_end = offset + PAGE_SIZE_64;
1254                         }
1255 give_up:
1256                         assert(cluster_end >= offset + PAGE_SIZE_64);
1257                         length = cluster_end - cluster_start;
1258
1259 #if     MACH_CLUSTER_STATS
1260                         CLUSTER_STAT_HIGHER(pages_at_higher_offsets);
1261                         CLUSTER_STAT_LOWER(pages_at_lower_offsets);
1262                         CLUSTER_STAT_CLUSTER(length/PAGE_SIZE);
1263 #endif  /* MACH_CLUSTER_STATS */
1264
1265 no_clustering:
1266                         /*
1267                          * lengthen the cluster by the pages in the working set
1268                          */
1269                         if((map != NULL) &&
1270                                 (current_task()->dynamic_working_set != 0)) {
1271                                 cluster_end = cluster_start + length;
1272                                 /* tws values for start and end are just a
1273                                  * suggestions.  Therefore, as long as
1274                                  * build_cluster does not use pointers or
1275                                  * take action based on values that
1276                                  * could be affected by re-entrance we
1277                                  * do not need to take the map lock.
1278                                  */
1279                                 tws_build_cluster((tws_hash_t)
1280                                         current_task()->dynamic_working_set,
1281                                         object, &cluster_start,
1282                                         &cluster_end, 0x16000);
1283                                 length = cluster_end - cluster_start;
1284                         }
1285 #if TRACEFAULTPAGE
1286                         dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);  /* (TEST/DEBUG) */
1287 #endif
1288                         /*
1289                          *      We have a busy page, so we can
1290                          *      release the object lock.
1291                          */
1292                         vm_object_unlock(object);
1293
1294                         /*
1295                          *      Call the memory manager to retrieve the data.
1296                          */
1297
1298                         if (type_of_fault)
1299                                 *type_of_fault = DBG_PAGEIN_FAULT;
1300                         VM_STAT(pageins++);
1301                         current_task()->pageins++;
1302                         bumped_pagein = TRUE;
1303
1304                         /*
1305                          *      If this object uses a copy_call strategy,
1306                          *      and we are interested in a copy of this object
1307                          *      (having gotten here only by following a
1308                          *      shadow chain), then tell the memory manager
1309                          *      via a flag added to the desired_access
1310                          *      parameter, so that it can detect a race
1311                          *      between our walking down the shadow chain
1312                          *      and its pushing pages up into a copy of
1313                          *      the object that it manages.
1314                          */
1315
1316                         if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL &&
1317                             object != first_object) {
1318                                 wants_copy_flag = VM_PROT_WANTS_COPY;
1319                         } else {
1320                                 wants_copy_flag = VM_PROT_NONE;
1321                         }
1322
1323                         XPR(XPR_VM_FAULT,
1324                             "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1325                                 (integer_t)object, offset, (integer_t)m,
1326                                 access_required | wants_copy_flag, 0);
1327
1328                         rc = memory_object_data_request(object->pager,
1329                                         cluster_start + object->paging_offset,
1330                                         length,
1331                                         access_required | wants_copy_flag);
1332
1333
1334 #if TRACEFAULTPAGE
1335                         dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1336 #endif
1337                         if (rc != KERN_SUCCESS) {
1338                                 if (rc != MACH_SEND_INTERRUPTED
1339                                     && vm_fault_debug)
1340                                         printf("%s(0x%x, 0x%x, 0x%x, 0x%x) failed, rc=%d\n",
1341                                                 "memory_object_data_request",
1342                                                 object->pager,
1343                                                 cluster_start + object->paging_offset,
1344                                                 length, access_required, rc);
1345                                 /*
1346                                  *      Don't want to leave a busy page around,
1347                                  *      but the data request may have blocked,
1348                                  *      so check if it's still there and busy.
1349                                  */
1350                                 if(!object->phys_contiguous) {
1351                                    vm_object_lock(object);
1352                                    for (; length; length -= PAGE_SIZE,
1353                                       cluster_start += PAGE_SIZE_64) {
1354                                       vm_page_t p;
1355                                       if ((p = vm_page_lookup(object,
1356                                                                 cluster_start))
1357                                             && p->absent && p->busy
1358                                             && p != first_m) {
1359                                          VM_PAGE_FREE(p);
1360                                       }
1361                                    }
1362                                 }
1363                                 vm_fault_cleanup(object, first_m);
1364                                 cur_thread->interruptible = interruptible_state;
1365                                 return((rc == MACH_SEND_INTERRUPTED) ?
1366                                         VM_FAULT_INTERRUPTED :
1367                                         VM_FAULT_MEMORY_ERROR);
1368                         } else {
1369 #ifdef notdefcdy
1370                                 tws_hash_line_t line;
1371                                 task_t          task;
1372
1373                                 task = current_task();
1374
1375                                 if((map != NULL) &&
1376                                         (task->dynamic_working_set != 0)) {
1377                                         if(tws_lookup
1378                                                 ((tws_hash_t)
1379                                                 task->dynamic_working_set,
1380                                                 offset, object,
1381                                                 &line) == KERN_SUCCESS) {
1382                                                 tws_line_signal((tws_hash_t)
1383                                                 task->dynamic_working_set,
1384                                                         map, line, vaddr);
1385                                         }
1386                                 }
1387 #endif
1388                         }
1389
1390                         /*
1391                          * Retry with same object/offset, since new data may
1392                          * be in a different page (i.e., m is meaningless at
1393                          * this point).
1394                          */
1395                         vm_object_lock(object);
1396                         if ((interruptible != THREAD_UNINT) &&
1397                             (current_thread()->state & TH_ABORT)) {
1398                                 vm_fault_cleanup(object, first_m);
1399                                 cur_thread->interruptible = interruptible_state;
1400                                 return(VM_FAULT_INTERRUPTED);
1401                         }
1402                         if(m == VM_PAGE_NULL)
1403                                 break;
1404                         continue;
1405                 }
1406
1407                 /*
1408                  * The only case in which we get here is if
1409                  * object has no pager (or unwiring).  If the pager doesn't
1410                  * have the page this is handled in the m->absent case above
1411                  * (and if you change things here you should look above).
1412                  */
1413 #if TRACEFAULTPAGE
1414                 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1415 #endif
1416                 if (object == first_object)
1417                         first_m = m;
1418                 else
1419                         assert(m == VM_PAGE_NULL);
1420
1421                 XPR(XPR_VM_FAULT,
1422                     "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1423                         (integer_t)object, offset, (integer_t)m,
1424                         (integer_t)object->shadow, 0);
1425                 /*
1426                  *      Move on to the next object.  Lock the next
1427                  *      object before unlocking the current one.
1428                  */
1429                 next_object = object->shadow;
1430                 if (next_object == VM_OBJECT_NULL) {
1431                         assert(!must_be_resident);
1432                         /*
1433                          *      If there's no object left, fill the page
1434                          *      in the top object with zeros.  But first we
1435                          *      need to allocate a real page.
1436                          */
1437
1438                         if (object != first_object) {
1439                                 vm_object_paging_end(object);
1440                                 vm_object_unlock(object);
1441
1442                                 object = first_object;
1443                                 offset = first_offset;
1444                                 vm_object_lock(object);
1445                         }
1446
1447                         m = first_m;
1448                         assert(m->object == object);
1449                         first_m = VM_PAGE_NULL;
1450
1451                         if (object->shadow_severed) {
1452                                 VM_PAGE_FREE(m);
1453                                 vm_fault_cleanup(object, VM_PAGE_NULL);
1454                                 cur_thread->interruptible = interruptible_state;
1455                                 return VM_FAULT_MEMORY_ERROR;
1456                         }
1457
1458                         if (VM_PAGE_THROTTLED() ||
1459                             (m->fictitious && !vm_page_convert(m))) {
1460                                 VM_PAGE_FREE(m);
1461                                 vm_fault_cleanup(object, VM_PAGE_NULL);
1462                                 cur_thread->interruptible = interruptible_state;
1463                                 return(VM_FAULT_MEMORY_SHORTAGE);
1464                         }
1465                         m->no_isync = FALSE;
1466
1467                         if (!no_zero_fill) {
1468                                 vm_object_unlock(object);
1469                                 vm_page_zero_fill(m);
1470                                 if (type_of_fault)
1471                                         *type_of_fault = DBG_ZERO_FILL_FAULT;
1472                                 VM_STAT(zero_fill_count++);
1473
1474                                 if (bumped_pagein == TRUE) {
1475                                         VM_STAT(pageins--);
1476                                         current_task()->pageins--;
1477                                 }
1478                                 vm_object_lock(object);
1479                         }
1480                         vm_page_lock_queues();
1481                         VM_PAGE_QUEUES_REMOVE(m);
1482                         m->page_ticket = vm_page_ticket;
1483                         vm_page_ticket_roll++;
1484                         if(vm_page_ticket_roll == VM_PAGE_TICKETS_IN_ROLL) {
1485                                 vm_page_ticket_roll = 0;
1486                                 if(vm_page_ticket ==
1487                                         VM_PAGE_TICKET_ROLL_IDS)
1488                                         vm_page_ticket= 0;
1489                                 else
1490                                         vm_page_ticket++;
1491                         }
1492                         queue_enter(&vm_page_queue_inactive,
1493                                                 m, vm_page_t, pageq);
1494                         m->inactive = TRUE;
1495                         vm_page_inactive_count++;
1496                         vm_page_unlock_queues();
1497                         pmap_clear_modify(m->phys_addr);
1498                         break;
1499                 }
1500                 else {
1501                         if ((object != first_object) || must_be_resident)
1502                                 vm_object_paging_end(object);
1503                         offset += object->shadow_offset;
1504                         hi_offset += object->shadow_offset;
1505                         lo_offset += object->shadow_offset;
1506                         access_required = VM_PROT_READ;
1507                         vm_object_lock(next_object);
1508                         vm_object_unlock(object);
1509                         object = next_object;
1510                         vm_object_paging_begin(object);
1511                 }
1512         }
1513
1514         /*
1515          *      PAGE HAS BEEN FOUND.
1516          *
1517          *      This page (m) is:
1518          *              busy, so that we can play with it;
1519          *              not absent, so that nobody else will fill it;
1520          *              possibly eligible for pageout;
1521          *
1522          *      The top-level page (first_m) is:
1523          *              VM_PAGE_NULL if the page was found in the
1524          *               top-level object;
1525          *              busy, not absent, and ineligible for pageout.
1526          *
1527          *      The current object (object) is locked.  A paging
1528          *      reference is held for the current and top-level
1529          *      objects.
1530          */
1531
1532 #if TRACEFAULTPAGE
1533         dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1534 #endif
1535 #if     EXTRA_ASSERTIONS
1536         if(m != VM_PAGE_NULL) {
1537                 assert(m->busy && !m->absent);
1538                 assert((first_m == VM_PAGE_NULL) ||
1539                         (first_m->busy && !first_m->absent &&
1540                          !first_m->active && !first_m->inactive));
1541         }
1542 #endif  /* EXTRA_ASSERTIONS */
1543
1544         XPR(XPR_VM_FAULT,
1545        "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1546                 (integer_t)object, offset, (integer_t)m,
1547                 (integer_t)first_object, (integer_t)first_m);
1548         /*
1549          *      If the page is being written, but isn't
1550          *      already owned by the top-level object,
1551          *      we have to copy it into a new page owned
1552          *      by the top-level object.
1553          */
1554
1555         if ((object != first_object) && (m != VM_PAGE_NULL)) {
1556                 /*
1557                  *      We only really need to copy if we
1558                  *      want to write it.
1559                  */
1560
1561 #if TRACEFAULTPAGE
1562                         dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1563 #endif
1564                 if (fault_type & VM_PROT_WRITE) {
1565                         vm_page_t copy_m;
1566
1567                         assert(!must_be_resident);
1568
1569                         /*
1570                          *      If we try to collapse first_object at this
1571                          *      point, we may deadlock when we try to get
1572                          *      the lock on an intermediate object (since we
1573                          *      have the bottom object locked).  We can't
1574                          *      unlock the bottom object, because the page
1575                          *      we found may move (by collapse) if we do.
1576                          *
1577                          *      Instead, we first copy the page.  Then, when
1578                          *      we have no more use for the bottom object,
1579                          *      we unlock it and try to collapse.
1580                          *
1581                          *      Note that we copy the page even if we didn't
1582                          *      need to... that's the breaks.
1583                          */
1584
1585                         /*
1586                          *      Allocate a page for the copy
1587                          */
1588                         copy_m = vm_page_grab();
1589                         if (copy_m == VM_PAGE_NULL) {
1590                                 RELEASE_PAGE(m);
1591                                 vm_fault_cleanup(object, first_m);
1592                                 cur_thread->interruptible = interruptible_state;
1593                                 return(VM_FAULT_MEMORY_SHORTAGE);
1594                         }
1595
1596
1597                         XPR(XPR_VM_FAULT,
1598                             "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1599                                 (integer_t)object, offset,
1600                                 (integer_t)m, (integer_t)copy_m, 0);
1601                         vm_page_copy(m, copy_m);
1602
1603                         /*
1604                          *      If another map is truly sharing this
1605                          *      page with us, we have to flush all
1606                          *      uses of the original page, since we
1607                          *      can't distinguish those which want the
1608                          *      original from those which need the
1609                          *      new copy.
1610                          *
1611                          *      XXXO If we know that only one map has
1612                          *      access to this page, then we could
1613                          *      avoid the pmap_page_protect() call.
1614                          */
1615
1616                         vm_page_lock_queues();
1617                         assert(!m->cleaning);
1618                         pmap_page_protect(m->phys_addr, VM_PROT_NONE);
1619                         vm_page_deactivate(m);
1620                         copy_m->dirty = TRUE;
1621                         /*
1622                          * Setting reference here prevents this fault from
1623                          * being counted as a (per-thread) reactivate as well
1624                          * as a copy-on-write.
1625                          */
1626                         first_m->reference = TRUE;
1627                         vm_page_unlock_queues();
1628
1629                         /*
1630                          *      We no longer need the old page or object.
1631                          */
1632
1633                         PAGE_WAKEUP_DONE(m);
1634                         vm_object_paging_end(object);
1635                         vm_object_unlock(object);
1636
1637                         if (type_of_fault)
1638                                 *type_of_fault = DBG_COW_FAULT;
1639                         VM_STAT(cow_faults++);
1640                         current_task()->cow_faults++;
1641                         object = first_object;
1642                         offset = first_offset;
1643
1644                         vm_object_lock(object);
1645                         VM_PAGE_FREE(first_m);
1646                         first_m = VM_PAGE_NULL;
1647                         assert(copy_m->busy);
1648                         vm_page_insert(copy_m, object, offset);
1649                         m = copy_m;
1650
1651                         /*
1652                          *      Now that we've gotten the copy out of the
1653                          *      way, let's try to collapse the top object.
1654                          *      But we have to play ugly games with
1655                          *      paging_in_progress to do that...
1656                          */
1657
1658                         vm_object_paging_end(object);
1659                         vm_object_collapse(object);
1660                         vm_object_paging_begin(object);
1661
1662                 }
1663                 else {
1664                         *protection &= (~VM_PROT_WRITE);
1665                 }
1666         }
1667
1668         /*
1669          *      Now check whether the page needs to be pushed into the
1670          *      copy object.  The use of asymmetric copy on write for
1671          *      shared temporary objects means that we may do two copies to
1672          *      satisfy the fault; one above to get the page from a
1673          *      shadowed object, and one here to push it into the copy.
1674          */
1675
1676         while (first_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
1677                (copy_object = first_object->copy) != VM_OBJECT_NULL &&
1678                    (m!= VM_PAGE_NULL)) {
1679                 vm_object_offset_t      copy_offset;
1680                 vm_page_t               copy_m;
1681
1682 #if TRACEFAULTPAGE
1683                 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);    /* (TEST/DEBUG) */
1684 #endif
1685                 /*
1686                  *      If the page is being written, but hasn't been
1687                  *      copied to the copy-object, we have to copy it there.
1688                  */
1689
1690                 if ((fault_type & VM_PROT_WRITE) == 0) {
1691                         *protection &= ~VM_PROT_WRITE;
1692                         break;
1693                 }
1694
1695                 /*
1696                  *      If the page was guaranteed to be resident,
1697                  *      we must have already performed the copy.
1698                  */
1699
1700                 if (must_be_resident)
1701                         break;
1702
1703                 /*
1704                  *      Try to get the lock on the copy_object.
1705                  */
1706                 if (!vm_object_lock_try(copy_object)) {
1707                         vm_object_unlock(object);
1708
1709                         mutex_pause();  /* wait a bit */
1710
1711                         vm_object_lock(object);
1712                         continue;
1713                 }
1714
1715                 /*
1716                  *      Make another reference to the copy-object,
1717                  *      to keep it from disappearing during the
1718                  *      copy.
1719                  */
1720                 assert(copy_object->ref_count > 0);
1721                 copy_object->ref_count++;
1722                 VM_OBJ_RES_INCR(copy_object);
1723
1724                 /*
1725                  *      Does the page exist in the copy?
1726                  */
1727                 copy_offset = first_offset - copy_object->shadow_offset;
1728                 if (copy_object->size <= copy_offset)
1729                         /*
1730                          * Copy object doesn't cover this page -- do nothing.
1731                          */
1732                         ;
1733                 else if ((copy_m =
1734                         vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1735                         /* Page currently exists in the copy object */
1736                         if (copy_m->busy) {
1737                                 /*
1738                                  *      If the page is being brought
1739                                  *      in, wait for it and then retry.
1740                                  */
1741                                 RELEASE_PAGE(m);
1742                                 /* take an extra ref so object won't die */
1743                                 assert(copy_object->ref_count > 0);
1744                                 copy_object->ref_count++;
1745                                 vm_object_res_reference(copy_object);
1746                                 vm_object_unlock(copy_object);
1747                                 vm_fault_cleanup(object, first_m);
1748                                 counter(c_vm_fault_page_block_backoff_kernel++);
1749                                 vm_object_lock(copy_object);
1750                                 assert(copy_object->ref_count > 0);
1751                                 VM_OBJ_RES_DECR(copy_object);
1752                                 copy_object->ref_count--;
1753                                 assert(copy_object->ref_count > 0);
1754                                 copy_m = vm_page_lookup(copy_object, copy_offset);
1755                                 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1756                                         PAGE_ASSERT_WAIT(copy_m, interruptible);
1757                                         vm_object_unlock(copy_object);
1758                                         wait_result = thread_block((void (*)(void))0);
1759                                         vm_object_deallocate(copy_object);
1760                                         goto backoff;
1761                                 } else {
1762                                         vm_object_unlock(copy_object);
1763                                         vm_object_deallocate(copy_object);
1764                                         cur_thread->interruptible = interruptible_state;
1765                                         return VM_FAULT_RETRY;
1766                                 }
1767                         }
1768                 }
1769                 else if (!PAGED_OUT(copy_object, copy_offset)) {
1770                         /*
1771                          * If PAGED_OUT is TRUE, then the page used to exist
1772                          * in the copy-object, and has already been paged out.
1773                          * We don't need to repeat this. If PAGED_OUT is
1774                          * FALSE, then either we don't know (!pager_created,
1775                          * for example) or it hasn't been paged out.
1776                          * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1777                          * We must copy the page to the copy object.
1778                          */
1779
1780                         /*
1781                          *      Allocate a page for the copy
1782                          */
1783                         copy_m = vm_page_alloc(copy_object, copy_offset);
1784                         if (copy_m == VM_PAGE_NULL) {
1785                                 RELEASE_PAGE(m);
1786                                 VM_OBJ_RES_DECR(copy_object);
1787                                 copy_object->ref_count--;
1788                                 assert(copy_object->ref_count > 0);
1789                                 vm_object_unlock(copy_object);
1790                                 vm_fault_cleanup(object, first_m);
1791                                 cur_thread->interruptible = interruptible_state;
1792                                 return(VM_FAULT_MEMORY_SHORTAGE);
1793                         }
1794
1795                         /*
1796                          *      Must copy page into copy-object.
1797                          */
1798
1799                         vm_page_copy(m, copy_m);
1800
1801                         /*
1802                          *      If the old page was in use by any users
1803                          *      of the copy-object, it must be removed
1804                          *      from all pmaps.  (We can't know which
1805                          *      pmaps use it.)
1806                          */
1807
1808                         vm_page_lock_queues();
1809                         assert(!m->cleaning);
1810                         pmap_page_protect(m->phys_addr, VM_PROT_NONE);
1811                         copy_m->dirty = TRUE;
1812                         vm_page_unlock_queues();
1813
1814                         /*
1815                          *      If there's a pager, then immediately
1816                          *      page out this page, using the "initialize"
1817                          *      option.  Else, we use the copy.
1818                          */
1819
1820                         if
1821 #if     MACH_PAGEMAP
1822                           ((!copy_object->pager_created) ||
1823                                 vm_external_state_get(
1824                                         copy_object->existence_map, copy_offset)
1825                                 == VM_EXTERNAL_STATE_ABSENT)
1826 #else
1827                           (!copy_object->pager_created)
1828 #endif
1829                                 {
1830                                 vm_page_lock_queues();
1831                                 vm_page_activate(copy_m);
1832                                 vm_page_unlock_queues();
1833                                 PAGE_WAKEUP_DONE(copy_m);
1834                         }
1835                         else {
1836                                 assert(copy_m->busy == TRUE);
1837
1838                                 /*
1839                                  *      The page is already ready for pageout:
1840                                  *      not on pageout queues and busy.
1841                                  *      Unlock everything except the
1842                                  *      copy_object itself.
1843                                  */
1844
1845                                 vm_object_unlock(object);
1846
1847                                 /*
1848                                  *      Write the page to the copy-object,
1849                                  *      flushing it from the kernel.
1850                                  */
1851
1852                                 vm_pageout_initialize_page(copy_m);
1853
1854                                 /*
1855                                  *      Since the pageout may have
1856                                  *      temporarily dropped the
1857                                  *      copy_object's lock, we
1858                                  *      check whether we'll have
1859                                  *      to deallocate the hard way.
1860                                  */
1861
1862                                 if ((copy_object->shadow != object) ||
1863                                     (copy_object->ref_count == 1)) {
1864                                         vm_object_unlock(copy_object);
1865                                         vm_object_deallocate(copy_object);
1866                                         vm_object_lock(object);
1867                                         continue;
1868                                 }
1869
1870                                 /*
1871                                  *      Pick back up the old object's
1872                                  *      lock.  [It is safe to do so,
1873                                  *      since it must be deeper in the
1874                                  *      object tree.]
1875                                  */
1876
1877                                 vm_object_lock(object);
1878                         }
1879
1880                         /*
1881                          *      Because we're pushing a page upward
1882                          *      in the object tree, we must restart
1883                          *      any faults that are waiting here.
1884                          *      [Note that this is an expansion of
1885                          *      PAGE_WAKEUP that uses the THREAD_RESTART
1886                          *      wait result].  Can't turn off the page's
1887                          *      busy bit because we're not done with it.
1888                          */
1889
1890                         if (m->wanted) {
1891                                 m->wanted = FALSE;
1892                                 thread_wakeup_with_result((event_t) m,
1893                                         THREAD_RESTART);
1894                         }
1895                 }
1896
1897                 /*
1898                  *      The reference count on copy_object must be
1899                  *      at least 2: one for our extra reference,
1900                  *      and at least one from the outside world
1901                  *      (we checked that when we last locked
1902                  *      copy_object).
1903                  */
1904                 copy_object->ref_count--;
1905                 assert(copy_object->ref_count > 0);
1906                 VM_OBJ_RES_DECR(copy_object);
1907                 vm_object_unlock(copy_object);
1908
1909                 break;
1910         }
1911
1912         *result_page = m;
1913         *top_page = first_m;
1914
1915         XPR(XPR_VM_FAULT,
1916                 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1917                 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0);
1918         /*
1919          *      If the page can be written, assume that it will be.
1920          *      [Earlier, we restrict the permission to allow write
1921          *      access only if the fault so required, so we don't
1922          *      mark read-only data as dirty.]
1923          */
1924
1925 #if     !VM_FAULT_STATIC_CONFIG
1926         if (vm_fault_dirty_handling && (*protection & VM_PROT_WRITE) &&
1927                         (m != VM_PAGE_NULL)) {
1928                 m->dirty = TRUE;
1929         }
1930 #endif
1931 #if TRACEFAULTPAGE
1932         dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_page_deactivate_behind);  /* (TEST/DEBUG) */
1933 #endif
1934         if (vm_page_deactivate_behind) {
1935                 if (offset && /* don't underflow */
1936                         (object->last_alloc == (offset - PAGE_SIZE_64))) {
1937                         m = vm_page_lookup(object, object->last_alloc);
1938                         if ((m != VM_PAGE_NULL) && !m->busy) {
1939                                 vm_page_lock_queues();
1940                                 vm_page_deactivate(m);
1941                                 vm_page_unlock_queues();
1942                         }
1943 #if TRACEFAULTPAGE
1944                         dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1945 #endif
1946                 }
1947                 object->last_alloc = offset;
1948         }
1949 #if TRACEFAULTPAGE
1950         dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);       /* (TEST/DEBUG) */
1951 #endif
1952         cur_thread->interruptible = interruptible_state;
1953         if(*result_page == VM_PAGE_NULL) {
1954                 vm_object_unlock(object);
1955         }
1956         return(VM_FAULT_SUCCESS);
1957
1958 #if 0
1959     block_and_backoff:
1960         vm_fault_cleanup(object, first_m);
1961
1962         counter(c_vm_fault_page_block_backoff_kernel++);
1963         thread_block((void (*)(void))0);
1964 #endif
1965
1966     backoff:
1967         cur_thread->interruptible = interruptible_state;
1968         if (wait_result == THREAD_INTERRUPTED)
1969                 return VM_FAULT_INTERRUPTED;
1970         return VM_FAULT_RETRY;
1971
1972 #undef  RELEASE_PAGE
1973 }
1974
1975 /*
1976  *      Routine:        vm_fault
1977  *      Purpose:
1978  *              Handle page faults, including pseudo-faults
1979  *              used to change the wiring status of pages.
1980  *      Returns:
1981  *              Explicit continuations have been removed.
1982  *      Implementation:
1983  *              vm_fault and vm_fault_page save mucho state
1984  *              in the moral equivalent of a closure.  The state
1985  *              structure is allocated when first entering vm_fault
1986  *              and deallocated when leaving vm_fault.
1987  */
1988
1989 kern_return_t
1990 vm_fault(
1991         vm_map_t        map,
1992         vm_offset_t     vaddr,
1993         vm_prot_t       fault_type,
1994         boolean_t       change_wiring,
1995         int             interruptible)
1996 {
1997         vm_map_version_t        version;        /* Map version for verificiation */
1998         boolean_t               wired;          /* Should mapping be wired down? */
1999         vm_object_t             object;         /* Top-level object */
2000         vm_object_offset_t      offset;         /* Top-level offset */
2001         vm_prot_t               prot;           /* Protection for mapping */
2002         vm_behavior_t           behavior;       /* Expected paging behavior */
2003         vm_object_offset_t      lo_offset, hi_offset;
2004         vm_object_t             old_copy_object; /* Saved copy object */
2005         vm_page_t               result_page;    /* Result of vm_fault_page */
2006         vm_page_t               top_page;       /* Placeholder page */
2007         kern_return_t           kr;
2008
2009         register
2010         vm_page_t               m;      /* Fast access to result_page */
2011         kern_return_t           error_code;     /* page error reasons */
2012         register
2013         vm_object_t             cur_object;
2014         register
2015         vm_object_offset_t      cur_offset;
2016         vm_page_t               cur_m;
2017         vm_object_t             new_object;
2018         int                     type_of_fault;
2019         vm_map_t                pmap_map = map;
2020         vm_map_t                original_map = map;
2021         pmap_t                  pmap = NULL;
2022         boolean_t               funnel_set = FALSE;
2023         funnel_t                *curflock;
2024         thread_t                cur_thread;
2025         boolean_t               interruptible_state;
2026
2027
2028         KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_START,
2029                               vaddr,
2030                               0,
2031                               0,
2032                               0,
2033                               0);
2034
2035         cur_thread = current_thread();
2036
2037         interruptible_state = cur_thread->interruptible;
2038         if (interruptible == THREAD_UNINT)
2039                 cur_thread->interruptible = FALSE;
2040
2041         /*
2042          * assume we will hit a page in the cache
2043          * otherwise, explicitly override with
2044          * the real fault type once we determine it
2045          */
2046         type_of_fault = DBG_CACHE_HIT_FAULT;
2047
2048         VM_STAT(faults++);
2049         current_task()->faults++;
2050
2051         /*
2052          * drop funnel if it is already held. Then restore while returning
2053          */
2054         if ((cur_thread->funnel_state & TH_FN_OWNED) == TH_FN_OWNED) {
2055                 funnel_set = TRUE;
2056                 curflock = cur_thread->funnel_lock;
2057                 thread_funnel_set( curflock , FALSE);
2058         }
2059
2060     RetryFault: ;
2061
2062         /*
2063          *      Find the backing store object and offset into
2064          *      it to begin the search.
2065          */
2066         map = original_map;
2067         vm_map_lock_read(map);
2068         kr = vm_map_lookup_locked(&map, vaddr, fault_type, &version,
2069                                 &object, &offset,
2070                                 &prot, &wired,
2071                                 &behavior, &lo_offset, &hi_offset, &pmap_map);
2072
2073         pmap = pmap_map->pmap;
2074
2075         if (kr != KERN_SUCCESS) {
2076                 vm_map_unlock_read(map);
2077                 goto done;
2078         }
2079
2080         /*
2081          *      If the page is wired, we must fault for the current protection
2082          *      value, to avoid further faults.
2083          */
2084
2085         if (wired)
2086                 fault_type = prot | VM_PROT_WRITE;
2087
2088 #if     VM_FAULT_CLASSIFY
2089         /*
2090          *      Temporary data gathering code
2091          */
2092         vm_fault_classify(object, offset, fault_type);
2093 #endif
2094         /*
2095          *      Fast fault code.  The basic idea is to do as much as
2096          *      possible while holding the map lock and object locks.
2097          *      Busy pages are not used until the object lock has to
2098          *      be dropped to do something (copy, zero fill, pmap enter).
2099          *      Similarly, paging references aren't acquired until that
2100          *      point, and object references aren't used.
2101          *
2102          *      If we can figure out what to do
2103          *      (zero fill, copy on write, pmap enter) while holding
2104          *      the locks, then it gets done.  Otherwise, we give up,
2105          *      and use the original fault path (which doesn't hold
2106          *      the map lock, and relies on busy pages).
2107          *      The give up cases include:
2108          *              - Have to talk to pager.
2109          *              - Page is busy, absent or in error.
2110          *              - Pager has locked out desired access.
2111          *              - Fault needs to be restarted.
2112          *              - Have to push page into copy object.
2113          *
2114          *      The code is an infinite loop that moves one level down
2115          *      the shadow chain each time.  cur_object and cur_offset
2116          *      refer to the current object being examined. object and offset
2117          *      are the original object from the map.  The loop is at the
2118          *      top level if and only if object and cur_object are the same.
2119          *
2120          *      Invariants:  Map lock is held throughout.  Lock is held on
2121          *              original object and cur_object (if different) when
2122          *              continuing or exiting loop.
2123          *
2124          */
2125
2126
2127         /*
2128          *      If this page is to be inserted in a copy delay object
2129          *      for writing, and if the object has a copy, then the
2130          *      copy delay strategy is implemented in the slow fault page.
2131          */
2132         if (object->copy_strategy != MEMORY_OBJECT_COPY_DELAY ||
2133             object->copy == VM_OBJECT_NULL ||
2134             (fault_type & VM_PROT_WRITE) == 0) {
2135         cur_object = object;
2136         cur_offset = offset;
2137
2138         while (TRUE) {
2139                 m = vm_page_lookup(cur_object, cur_offset);
2140                 if (m != VM_PAGE_NULL) {
2141                         if (m->busy)
2142                                 break;
2143
2144                         if (m->unusual && (m->error || m->restart || m->private
2145                             || m->absent || (fault_type & m->page_lock))) {
2146
2147                         /*
2148                                  *      Unusual case. Give up.
2149                                  */
2150                                 break;
2151                         }
2152
2153                         /*
2154                          *      Two cases of map in faults:
2155                          *          - At top level w/o copy object.
2156                          *          - Read fault anywhere.
2157                          *              --> must disallow write.
2158                          */
2159
2160                         if (object == cur_object &&
2161                             object->copy == VM_OBJECT_NULL)
2162                                 goto FastMapInFault;
2163
2164                         if ((fault_type & VM_PROT_WRITE) == 0) {
2165
2166                                 prot &= ~VM_PROT_WRITE;
2167
2168                                 /*
2169                                  *      Set up to map the page ...
2170                                  *      mark the page busy, drop
2171                                  *      locks and take a paging reference
2172                                  *      on the object with the page.
2173                                  */
2174
2175                                 if (object != cur_object) {
2176                                         vm_object_unlock(object);
2177                                         object = cur_object;
2178                                 }
2179 FastMapInFault:
2180                                 m->busy = TRUE;
2181
2182                                 vm_object_paging_begin(object);
2183                                 vm_object_unlock(object);
2184
2185 FastPmapEnter:
2186                                 /*
2187                                  *      Check a couple of global reasons to
2188                                  *      be conservative about write access.
2189                                  *      Then do the pmap_enter.
2190                                  */
2191 #if     !VM_FAULT_STATIC_CONFIG
2192                                 if (vm_fault_dirty_handling
2193 #if     MACH_KDB
2194                                     || db_watchpoint_list
2195 #endif
2196                                     && (fault_type & VM_PROT_WRITE) == 0)
2197                                         prot &= ~VM_PROT_WRITE;
2198 #else   /* STATIC_CONFIG */
2199 #if     MACH_KDB
2200                                 if (db_watchpoint_list
2201                                     && (fault_type & VM_PROT_WRITE) == 0)
2202                                         prot &= ~VM_PROT_WRITE;
2203 #endif  /* MACH_KDB */
2204 #endif  /* STATIC_CONFIG */
2205                                 PMAP_ENTER(pmap, vaddr, m, prot, wired);
2206
2207                                 if (m->no_isync) {
2208                                         pmap_attribute(pmap,
2209                                                        vaddr,
2210                                                        PAGE_SIZE,
2211                                                        MATTR_CACHE,
2212                                                        &mv_cache_sync);
2213
2214                                 }
2215                                 {
2216                                    tws_hash_line_t      line;
2217                                    task_t               task;
2218
2219                                    task = current_task();
2220                                    if((map != NULL) &&
2221                                         (task->dynamic_working_set != 0)) {
2222                                         if(tws_lookup
2223                                                 ((tws_hash_t)
2224                                                 task->dynamic_working_set,
2225                                                 cur_offset, object,
2226                                                 &line) != KERN_SUCCESS) {
2227                                                 if(tws_insert((tws_hash_t)
2228                                                    task->dynamic_working_set,
2229                                                    m->offset, m->object,
2230                                                    vaddr, pmap_map)
2231                                                         == KERN_NO_SPACE) {
2232                                                    tws_expand_working_set(
2233                                                       task->dynamic_working_set,
2234                                                       TWS_HASH_LINE_COUNT);
2235                                                 }
2236                                         }
2237                                    }
2238                                 }
2239
2240                                 if (m->clustered) {
2241                                         vm_pagein_cluster_used++;
2242                                         m->clustered = FALSE;
2243                                 }
2244                                 /*
2245                                  *      Grab the object lock to manipulate
2246                                  *      the page queues.  Change wiring
2247                                  *      case is obvious.  In soft ref bits
2248                                  *      case activate page only if it fell
2249                                  *      off paging queues, otherwise just
2250                                  *      activate it if it's inactive.
2251                                  *
2252                                  *      NOTE: original vm_fault code will
2253                                  *      move active page to back of active
2254                                  *      queue.  This code doesn't.
2255                                  */
2256                                 vm_object_lock(object);
2257                                 vm_page_lock_queues();
2258                                 /*
2259                                  * we did the isync above... we're clearing
2260                                  * the flag here to avoid holding a lock
2261                                  * while calling pmap functions, however
2262                                  * we need hold the object lock before
2263                                  * we can modify the flag
2264                                  */
2265                                 m->no_isync = FALSE;
2266                                 m->reference = TRUE;
2267
2268                                 if (change_wiring) {
2269                                         if (wired)
2270                                                 vm_page_wire(m);
2271                                         else
2272                                                 vm_page_unwire(m);
2273                                 }
2274 #if VM_FAULT_STATIC_CONFIG
2275                                 else {
2276                                         if (!m->active && !m->inactive)
2277                                                 vm_page_activate(m);
2278                                 }
2279 #else
2280                                 else if (software_reference_bits) {
2281                                         if (!m->active && !m->inactive)
2282                                                 vm_page_activate(m);
2283                                 }
2284                                 else if (!m->active) {
2285                                         vm_page_activate(m);
2286                                 }
2287 #endif
2288                                 vm_page_unlock_queues();
2289
2290                                 /*
2291                                  *      That's it, clean up and return.
2292                                  */
2293                                 PAGE_WAKEUP_DONE(m);
2294                                 vm_object_paging_end(object);
2295                                 vm_object_unlock(object);
2296                                 vm_map_unlock_read(map);
2297                                 if(pmap_map != map)
2298                                         vm_map_unlock(pmap_map);
2299
2300                                 if (funnel_set) {
2301                                         thread_funnel_set( curflock, TRUE);
2302                                         funnel_set = FALSE;
2303                                 }
2304                                 cur_thread->interruptible = interruptible_state;
2305
2306                                 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
2307                                                       vaddr,
2308                                                       type_of_fault,
2309                                                       KERN_SUCCESS,
2310                                                       0,
2311                                                       0);
2312                                 return KERN_SUCCESS;
2313                         }
2314
2315                         /*
2316                          *      Copy on write fault.  If objects match, then
2317                          *      object->copy must not be NULL (else control
2318                          *      would be in previous code block), and we
2319                          *      have a potential push into the copy object
2320                          *      with which we won't cope here.
2321                          */
2322
2323                         if (cur_object == object)
2324                                 break;
2325
2326                         /*
2327                          *      This is now a shadow based copy on write
2328                          *      fault -- it requires a copy up the shadow
2329                          *      chain.
2330                          *
2331                          *      Allocate a page in the original top level
2332                          *      object. Give up if allocate fails.  Also
2333                          *      need to remember current page, as it's the
2334                          *      source of the copy.
2335                          */
2336                         cur_m = m;
2337                         m = vm_page_grab();
2338                         if (m == VM_PAGE_NULL) {
2339                                 break;
2340                         }
2341
2342                         /*
2343                          *      Now do the copy.  Mark the source busy
2344                          *      and take out paging references on both
2345                          *      objects.
2346                          *
2347                          *      NOTE: This code holds the map lock across
2348                          *      the page copy.
2349                          */
2350
2351                         cur_m->busy = TRUE;
2352                         vm_page_copy(cur_m, m);
2353                         vm_page_insert(m, object, offset);
2354
2355                         vm_object_paging_begin(cur_object);
2356                         vm_object_paging_begin(object);
2357
2358                         type_of_fault = DBG_COW_FAULT;
2359                         VM_STAT(cow_faults++);
2360                         current_task()->cow_faults++;
2361
2362                         /*
2363                          *      Now cope with the source page and object
2364                          *      If the top object has a ref count of 1
2365                          *      then no other map can access it, and hence
2366                          *      it's not necessary to do the pmap_page_protect.
2367                          */
2368
2369
2370                         vm_page_lock_queues();
2371                         vm_page_deactivate(cur_m);
2372                         m->dirty = TRUE;
2373                         pmap_page_protect(cur_m->phys_addr,
2374                                                   VM_PROT_NONE);
2375                         vm_page_unlock_queues();
2376
2377                         PAGE_WAKEUP_DONE(cur_m);
2378                         vm_object_paging_end(cur_object);
2379                         vm_object_unlock(cur_object);
2380
2381                         /*
2382                          *      Slight hack to call vm_object collapse
2383                          *      and then reuse common map in code.
2384                          *      note that the object lock was taken above.
2385                          */
2386
2387                         vm_object_paging_end(object);
2388                         vm_object_collapse(object);
2389                         vm_object_paging_begin(object);
2390                         vm_object_unlock(object);
2391
2392                         goto FastPmapEnter;
2393                 }
2394                 else {
2395
2396                         /*
2397                          *      No page at cur_object, cur_offset
2398                          */
2399
2400                         if (cur_object->pager_created) {
2401
2402                                 /*
2403                                  *      Have to talk to the pager.  Give up.
2404                                  */
2405
2406                                 break;
2407                         }
2408
2409
2410                         if (cur_object->shadow == VM_OBJECT_NULL) {
2411
2412                                 if (cur_object->shadow_severed) {
2413                                         vm_object_paging_end(object);
2414                                         vm_object_unlock(object);
2415                                         vm_map_unlock_read(map);
2416                                         if(pmap_map != map)
2417                                                 vm_map_unlock(pmap_map);
2418
2419                                         if (funnel_set) {
2420                                                 thread_funnel_set( curflock, TRUE);
2421                                                 funnel_set = FALSE;
2422                                         }
2423                                         cur_thread->interruptible = interruptible_state;
2424
2425                                         return VM_FAULT_MEMORY_ERROR;
2426                                 }
2427
2428                                 /*
2429                                  *      Zero fill fault.  Page gets
2430                                  *      filled in top object. Insert
2431                                  *      page, then drop any lower lock.
2432                                  *      Give up if no page.
2433                                  */
2434                                 if ((vm_page_free_target -
2435                                    ((vm_page_free_target-vm_page_free_min)>>2))
2436                                                 > vm_page_free_count) {
2437                                         break;
2438                                 }
2439                                 m = vm_page_alloc(object, offset);
2440                                 if (m == VM_PAGE_NULL) {
2441                                         break;
2442                                 }
2443                                 /*
2444                                  * This is a zero-fill or initial fill
2445                                  * page fault.  As such, we consider it
2446                                  * undefined with respect to instruction
2447                                  * execution.  i.e. it is the responsibility
2448                                  * of higher layers to call for an instruction
2449                                  * sync after changing the contents and before
2450                                  * sending a program into this area.  We
2451                                  * choose this approach for performance
2452                                  */
2453
2454                                 m->no_isync = FALSE;
2455
2456                                 if (cur_object != object)
2457                                         vm_object_unlock(cur_object);
2458
2459                                 vm_object_paging_begin(object);
2460                                 vm_object_unlock(object);
2461
2462                                 /*
2463                                  *      Now zero fill page and map it.
2464                                  *      the page is probably going to
2465                                  *      be written soon, so don't bother
2466                                  *      to clear the modified bit
2467                                  *
2468                                  *      NOTE: This code holds the map
2469                                  *      lock across the zero fill.
2470                                  */
2471
2472                                 if (!map->no_zero_fill) {
2473                                         vm_page_zero_fill(m);
2474                                         type_of_fault = DBG_ZERO_FILL_FAULT;
2475                                         VM_STAT(zero_fill_count++);
2476                                 }
2477                                 vm_page_lock_queues();
2478                                 VM_PAGE_QUEUES_REMOVE(m);
2479
2480                                 m->page_ticket = vm_page_ticket;
2481                                 vm_page_ticket_roll++;
2482                                 if(vm_page_ticket_roll ==
2483                                                 VM_PAGE_TICKETS_IN_ROLL) {
2484                                         vm_page_ticket_roll = 0;
2485                                         if(vm_page_ticket ==
2486                                                 VM_PAGE_TICKET_ROLL_IDS)
2487                                                 vm_page_ticket= 0;
2488                                         else
2489                                                 vm_page_ticket++;
2490                                 }
2491
2492                                 queue_enter(&vm_page_queue_inactive,
2493                                                         m, vm_page_t, pageq);
2494                                 m->inactive = TRUE;
2495                                 vm_page_inactive_count++;
2496                                 vm_page_unlock_queues();
2497                                 goto FastPmapEnter;
2498                         }
2499
2500                         /*
2501                          *      On to the next level
2502                          */
2503
2504                         cur_offset += cur_object->shadow_offset;
2505                         new_object = cur_object->shadow;
2506                         vm_object_lock(new_object);
2507                         if (cur_object != object)
2508                                 vm_object_unlock(cur_object);
2509                         cur_object = new_object;
2510
2511                         continue;
2512                 }
2513         }
2514
2515         /*
2516          *      Cleanup from fast fault failure.  Drop any object
2517          *      lock other than original and drop map lock.
2518          */
2519
2520         if (object != cur_object)
2521                 vm_object_unlock(cur_object);
2522         }
2523         vm_map_unlock_read(map);
2524         if(pmap_map != map)
2525                 vm_map_unlock(pmap_map);
2526
2527         /*
2528          *      Make a reference to this object to
2529          *      prevent its disposal while we are messing with
2530          *      it.  Once we have the reference, the map is free
2531          *      to be diddled.  Since objects reference their
2532          *      shadows (and copies), they will stay around as well.
2533          */
2534
2535         assert(object->ref_count > 0);
2536         object->ref_count++;
2537         vm_object_res_reference(object);
2538         vm_object_paging_begin(object);
2539
2540         XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
2541         kr = vm_fault_page(object, offset, fault_type,
2542                            (change_wiring && !wired),
2543                            interruptible,
2544                            lo_offset, hi_offset, behavior,
2545                            &prot, &result_page, &top_page,
2546                            &type_of_fault,
2547                            &error_code, map->no_zero_fill, FALSE, map, vaddr);
2548
2549         /*
2550          *      If we didn't succeed, lose the object reference immediately.
2551          */
2552
2553         if (kr != VM_FAULT_SUCCESS)
2554                 vm_object_deallocate(object);
2555
2556         /*
2557          *      See why we failed, and take corrective action.
2558          */
2559
2560         switch (kr) {
2561                 case VM_FAULT_SUCCESS:
2562                         break;
2563                 case VM_FAULT_MEMORY_SHORTAGE:
2564                         if (vm_page_wait((change_wiring) ?
2565                                          THREAD_UNINT :
2566                                          THREAD_ABORTSAFE))
2567                                 goto RetryFault;
2568                         /* fall thru */
2569                 case VM_FAULT_INTERRUPTED:
2570                         kr = KERN_ABORTED;
2571                         goto done;
2572                 case VM_FAULT_RETRY:
2573                         goto RetryFault;
2574                 case VM_FAULT_FICTITIOUS_SHORTAGE:
2575                         vm_page_more_fictitious();
2576                         goto RetryFault;
2577                 case VM_FAULT_MEMORY_ERROR:
2578                         if (error_code)
2579                                 kr = error_code;
2580                         else
2581                                 kr = KERN_MEMORY_ERROR;
2582                         goto done;
2583         }
2584
2585         m = result_page;
2586
2587         if(m != VM_PAGE_NULL) {
2588                 assert((change_wiring && !wired) ?
2589                     (top_page == VM_PAGE_NULL) :
2590                     ((top_page == VM_PAGE_NULL) == (m->object == object)));
2591         }
2592
2593         /*
2594          *      How to clean up the result of vm_fault_page.  This
2595          *      happens whether the mapping is entered or not.
2596          */
2597
2598 #define UNLOCK_AND_DEALLOCATE                           \
2599         MACRO_BEGIN                                     \
2600         vm_fault_cleanup(m->object, top_page);          \
2601         vm_object_deallocate(object);                   \
2602         MACRO_END
2603
2604         /*
2605          *      What to do with the resulting page from vm_fault_page
2606          *      if it doesn't get entered into the physical map:
2607          */
2608
2609 #define RELEASE_PAGE(m)                                 \
2610         MACRO_BEGIN                                     \
2611         PAGE_WAKEUP_DONE(m);                            \
2612         vm_page_lock_queues();                          \
2613         if (!m->active && !m->inactive)                 \
2614                 vm_page_activate(m);                    \
2615         vm_page_unlock_queues();                        \
2616         MACRO_END
2617
2618         /*
2619          *      We must verify that the maps have not changed
2620          *      since our last lookup.
2621          */
2622
2623         if(m != VM_PAGE_NULL) {
2624                 old_copy_object = m->object->copy;
2625
2626                 vm_object_unlock(m->object);
2627         } else {
2628                 old_copy_object = VM_OBJECT_NULL;
2629         }
2630         if ((map != original_map) || !vm_map_verify(map, &version)) {
2631                 vm_object_t             retry_object;
2632                 vm_object_offset_t      retry_offset;
2633                 vm_prot_t               retry_prot;
2634
2635                 /*
2636                  *      To avoid trying to write_lock the map while another
2637                  *      thread has it read_locked (in vm_map_pageable), we
2638                  *      do not try for write permission.  If the page is
2639                  *      still writable, we will get write permission.  If it
2640                  *      is not, or has been marked needs_copy, we enter the
2641                  *      mapping without write permission, and will merely
2642                  *      take another fault.
2643                  */
2644                 map = original_map;
2645                 vm_map_lock_read(map);
2646                 kr = vm_map_lookup_locked(&map, vaddr,
2647                                    fault_type & ~VM_PROT_WRITE, &version,
2648                                    &retry_object, &retry_offset, &retry_prot,
2649                                    &wired, &behavior, &lo_offset, &hi_offset,
2650                                    &pmap_map);
2651                 pmap = pmap_map->pmap;
2652
2653                 if (kr != KERN_SUCCESS) {
2654                         vm_map_unlock_read(map);
2655                         if(m != VM_PAGE_NULL) {
2656                                 vm_object_lock(m->object);
2657                                 RELEASE_PAGE(m);
2658                                 UNLOCK_AND_DEALLOCATE;
2659                         } else {
2660                                 vm_object_deallocate(object);
2661                         }
2662                         goto done;
2663                 }
2664
2665                 vm_object_unlock(retry_object);
2666                 if(m != VM_PAGE_NULL) {
2667                         vm_object_lock(m->object);
2668                 } else {
2669                         vm_object_lock(object);
2670                 }
2671
2672                 if ((retry_object != object) ||
2673                     (retry_offset != offset)) {
2674                         vm_map_unlock_read(map);
2675                         if(pmap_map != map)
2676                                 vm_map_unlock(pmap_map);
2677                         if(m != VM_PAGE_NULL) {
2678                                 RELEASE_PAGE(m);
2679                                 UNLOCK_AND_DEALLOCATE;
2680                         } else {
2681                                 vm_object_deallocate(object);
2682                         }
2683                         goto RetryFault;
2684                 }
2685
2686                 /*
2687                  *      Check whether the protection has changed or the object
2688                  *      has been copied while we left the map unlocked.
2689                  */
2690                 prot &= retry_prot;
2691                 if(m != VM_PAGE_NULL) {
2692                         vm_object_unlock(m->object);
2693                 } else {
2694                         vm_object_unlock(object);
2695                 }
2696         }
2697         if(m != VM_PAGE_NULL) {
2698                 vm_object_lock(m->object);
2699         } else {
2700                 vm_object_lock(object);
2701         }
2702
2703         /*
2704          *      If the copy object changed while the top-level object
2705          *      was unlocked, then we must take away write permission.
2706          */
2707
2708         if(m != VM_PAGE_NULL) {
2709                 if (m->object->copy != old_copy_object)
2710                         prot &= ~VM_PROT_WRITE;
2711         }
2712
2713         /*
2714          *      If we want to wire down this page, but no longer have
2715          *      adequate permissions, we must start all over.
2716          */
2717
2718         if (wired && (fault_type != (prot|VM_PROT_WRITE))) {
2719                 vm_map_verify_done(map, &version);
2720                 if(pmap_map != map)
2721                         vm_map_unlock(pmap_map);
2722                 if(m != VM_PAGE_NULL) {
2723                         RELEASE_PAGE(m);
2724                         UNLOCK_AND_DEALLOCATE;
2725                 } else {
2726                         vm_object_deallocate(object);
2727                 }
2728                 goto RetryFault;
2729         }
2730
2731         /*
2732          *      Put this page into the physical map.
2733          *      We had to do the unlock above because pmap_enter
2734          *      may cause other faults.  The page may be on
2735          *      the pageout queues.  If the pageout daemon comes
2736          *      across the page, it will remove it from the queues.
2737          */
2738         if(m != VM_PAGE_NULL) {
2739                 if (m->no_isync) {
2740                 m->no_isync = FALSE;
2741
2742                 vm_object_unlock(m->object);
2743
2744                         PMAP_ENTER(pmap, vaddr, m, prot, wired);
2745
2746                         /*
2747                          *      It's critically important that a wired-down page be faulted
2748                          *      only once in each map for which it is wired.
2749                         */
2750                         /* Sync I & D caches for new mapping */
2751                         pmap_attribute(pmap,
2752                                vaddr,
2753                                PAGE_SIZE,
2754                                MATTR_CACHE,
2755                                &mv_cache_sync);
2756                 } else {
2757                 vm_object_unlock(m->object);
2758
2759                         PMAP_ENTER(pmap, vaddr, m, prot, wired);
2760                 }
2761                 {
2762                         tws_hash_line_t line;
2763                         task_t          task;
2764
2765                            task = current_task();
2766                            if((map != NULL) &&
2767                                 (task->dynamic_working_set != 0)) {
2768                                 if(tws_lookup
2769                                         ((tws_hash_t)
2770                                         task->dynamic_working_set,
2771                                         m->offset, m->object,
2772                                         &line) != KERN_SUCCESS) {
2773                                         tws_insert((tws_hash_t)
2774                                            task->dynamic_working_set,
2775                                            m->offset, m->object,
2776                                            vaddr, pmap_map);
2777                                         if(tws_insert((tws_hash_t)
2778                                                    task->dynamic_working_set,
2779                                                    m->offset, m->object,
2780                                                    vaddr, pmap_map)
2781                                                                 == KERN_NO_SPACE) {
2782                                                 tws_expand_working_set(
2783                                                         task->dynamic_working_set,
2784                                                         TWS_HASH_LINE_COUNT);
2785                                         }
2786                                 }
2787                         }
2788                 }
2789         } else {
2790
2791 /*  if __ppc__  not working until figure out phys copy on block maps */
2792 #ifdef notdefcdy
2793                 int     memattr;
2794                 struct  phys_entry      *pp;
2795                 /*
2796                  * do a pmap block mapping from the physical address
2797                  * in the object
2798                  */
2799                 if(pp = pmap_find_physentry(
2800                         (vm_offset_t)object->shadow_offset)) {
2801                         memattr = ((pp->pte1 & 0x00000078) >> 3);
2802                 } else {
2803                         memattr = PTE_WIMG_UNCACHED_COHERENT_GUARDED;
2804                 }
2805
2806                 pmap_map_block(pmap, vaddr,
2807                         (vm_offset_t)object->shadow_offset,
2808                         object->size, prot,
2809                         memattr, 0); /* Set up a block mapped area */
2810 //#else
2811                 vm_offset_t     off;
2812                 for (off = 0; off < object->size; off += page_size) {
2813                         pmap_enter(pmap, vaddr + off,
2814                                 object->shadow_offset + off, prot, TRUE);
2815                         /* Map it in */
2816                 }
2817 #endif
2818
2819         }
2820
2821         /*
2822          *      If the page is not wired down and isn't already
2823          *      on a pageout queue, then put it where the
2824          *      pageout daemon can find it.
2825          */
2826         if(m != VM_PAGE_NULL) {
2827                 vm_object_lock(m->object);
2828                 vm_page_lock_queues();
2829
2830                 if (change_wiring) {
2831                         if (wired)
2832                                 vm_page_wire(m);
2833                         else
2834                                 vm_page_unwire(m);
2835                 }
2836 #if     VM_FAULT_STATIC_CONFIG
2837                 else {
2838                         if (!m->active && !m->inactive)
2839                                 vm_page_activate(m);
2840                         m->reference = TRUE;
2841                 }
2842 #else
2843                 else if (software_reference_bits) {
2844                         if (!m->active && !m->inactive)
2845                                 vm_page_activate(m);
2846                         m->reference = TRUE;
2847                 } else {
2848                         vm_page_activate(m);
2849                 }
2850 #endif
2851                 vm_page_unlock_queues();
2852         }
2853
2854         /*
2855          *      Unlock everything, and return
2856          */
2857
2858         vm_map_verify_done(map, &version);
2859         if(pmap_map != map)
2860                 vm_map_unlock(pmap_map);
2861         if(m != VM_PAGE_NULL) {
2862                 PAGE_WAKEUP_DONE(m);
2863                 UNLOCK_AND_DEALLOCATE;
2864         } else {
2865                 vm_fault_cleanup(object, top_page);
2866                 vm_object_deallocate(object);
2867         }
2868         kr = KERN_SUCCESS;
2869
2870 #undef  UNLOCK_AND_DEALLOCATE
2871 #undef  RELEASE_PAGE
2872
2873     done:
2874         if (funnel_set) {
2875                 thread_funnel_set( curflock, TRUE);
2876                 funnel_set = FALSE;
2877         }
2878         cur_thread->interruptible = interruptible_state;
2879
2880         KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
2881                               vaddr,
2882                               type_of_fault,
2883                               kr,
2884                               0,
2885                               0);
2886         return(kr);
2887 }
2888
2889 /*
2890  *      vm_fault_wire:
2891  *
2892  *      Wire down a range of virtual addresses in a map.
2893  */
2894 kern_return_t
2895 vm_fault_wire(
2896         vm_map_t        map,
2897         vm_map_entry_t  entry,
2898         pmap_t          pmap)
2899 {
2900
2901         register vm_offset_t    va;
2902         register vm_offset_t    end_addr = entry->vme_end;
2903         register kern_return_t  rc;
2904
2905         assert(entry->in_transition);
2906
2907         /*
2908          *      Inform the physical mapping system that the
2909          *      range of addresses may not fault, so that
2910          *      page tables and such can be locked down as well.
2911          */
2912
2913         pmap_pageable(pmap, entry->vme_start, end_addr, FALSE);
2914
2915         /*
2916          *      We simulate a fault to get the page and enter it
2917          *      in the physical map.
2918          */
2919
2920         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
2921                 if ((rc = vm_fault_wire_fast(
2922                                 map, va, entry, pmap)) != KERN_SUCCESS) {
2923                         rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
2924                                       (pmap == kernel_pmap) ? THREAD_UNINT : THREAD_ABORTSAFE);
2925                 }
2926
2927                 if (rc != KERN_SUCCESS) {
2928                         struct vm_map_entry     tmp_entry = *entry;
2929
2930                         /* unwire wired pages */
2931                         tmp_entry.vme_end = va;
2932                         vm_fault_unwire(map, &tmp_entry, FALSE, pmap);
2933
2934                         return rc;
2935                 }
2936         }
2937         return KERN_SUCCESS;
2938 }
2939
2940 /*
2941  *      vm_fault_unwire:
2942  *
2943  *      Unwire a range of virtual addresses in a map.
2944  */
2945 void
2946 vm_fault_unwire(
2947         vm_map_t        map,
2948         vm_map_entry_t  entry,
2949         boolean_t       deallocate,
2950         pmap_t          pmap)
2951 {
2952         register vm_offset_t    va;
2953         register vm_offset_t    end_addr = entry->vme_end;
2954         vm_object_t             object;
2955
2956         object = (entry->is_sub_map)
2957                         ? VM_OBJECT_NULL : entry->object.vm_object;
2958
2959         /*
2960          *      Since the pages are wired down, we must be able to
2961          *      get their mappings from the physical map system.
2962          */
2963
2964         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
2965                 pmap_change_wiring(pmap, va, FALSE);
2966
2967                 if (object == VM_OBJECT_NULL) {
2968                         (void) vm_fault(map, va, VM_PROT_NONE, TRUE, THREAD_UNINT);
2969                 } else {
2970                         vm_prot_t       prot;
2971                         vm_page_t       result_page;
2972                         vm_page_t       top_page;
2973                         vm_object_t     result_object;
2974                         vm_fault_return_t result;
2975
2976                         do {
2977                                 prot = VM_PROT_NONE;
2978
2979                                 vm_object_lock(object);
2980                                 vm_object_paging_begin(object);
2981                                 XPR(XPR_VM_FAULT,
2982                                         "vm_fault_unwire -> vm_fault_page\n",
2983                                         0,0,0,0,0);
2984                                 result = vm_fault_page(object,
2985                                                 entry->offset +
2986                                                   (va - entry->vme_start),
2987                                                 VM_PROT_NONE, TRUE,
2988                                                 THREAD_UNINT,
2989                                                 entry->offset,
2990                                                 entry->offset +
2991                                                        (entry->vme_end
2992                                                         - entry->vme_start),
2993                                                 entry->behavior,
2994                                                 &prot,
2995                                                 &result_page,
2996                                                 &top_page,
2997                                                 (int *)0,
2998                                                 0, map->no_zero_fill,
2999                                                 FALSE, NULL, 0);
3000                         } while (result == VM_FAULT_RETRY);
3001
3002                         if (result != VM_FAULT_SUCCESS)
3003                                 panic("vm_fault_unwire: failure");
3004
3005                         result_object = result_page->object;
3006                         if (deallocate) {
3007                                 assert(!result_page->fictitious);
3008                                 pmap_page_protect(result_page->phys_addr,
3009                                                 VM_PROT_NONE);
3010                                 VM_PAGE_FREE(result_page);
3011                         } else {
3012                                 vm_page_lock_queues();
3013                                 vm_page_unwire(result_page);
3014                                 vm_page_unlock_queues();
3015                                 PAGE_WAKEUP_DONE(result_page);
3016                         }
3017
3018                         vm_fault_cleanup(result_object, top_page);
3019                 }
3020         }
3021
3022         /*
3023          *      Inform the physical mapping system that the range
3024          *      of addresses may fault, so that page tables and
3025          *      such may be unwired themselves.
3026          */
3027
3028         pmap_pageable(pmap, entry->vme_start, end_addr, TRUE);
3029
3030 }
3031
3032 /*
3033  *      vm_fault_wire_fast:
3034  *
3035  *      Handle common case of a wire down page fault at the given address.
3036  *      If successful, the page is inserted into the associated physical map.
3037  *      The map entry is passed in to avoid the overhead of a map lookup.
3038  *
3039  *      NOTE: the given address should be truncated to the
3040  *      proper page address.
3041  *
3042  *      KERN_SUCCESS is returned if the page fault is handled; otherwise,
3043  *      a standard error specifying why the fault is fatal is returned.
3044  *
3045  *      The map in question must be referenced, and remains so.
3046  *      Caller has a read lock on the map.
3047  *
3048  *      This is a stripped version of vm_fault() for wiring pages.  Anything
3049  *      other than the common case will return KERN_FAILURE, and the caller
3050  *      is expected to call vm_fault().
3051  */
3052 kern_return_t
3053 vm_fault_wire_fast(
3054         vm_map_t        map,
3055         vm_offset_t     va,
3056         vm_map_entry_t  entry,
3057         pmap_t          pmap)
3058 {
3059         vm_object_t             object;
3060         vm_object_offset_t      offset;
3061         register vm_page_t      m;
3062         vm_prot_t               prot;
3063         thread_act_t            thr_act;
3064
3065         VM_STAT(faults++);
3066
3067         if((thr_act=current_act()) && (thr_act->task != TASK_NULL))
3068           thr_act->task->faults++;
3069
3070 /*
3071  *      Recovery actions
3072  */
3073
3074 #undef  RELEASE_PAGE
3075 #define RELEASE_PAGE(m) {                               \
3076         PAGE_WAKEUP_DONE(m);                            \
3077         vm_page_lock_queues();                          \
3078         vm_page_unwire(m);                              \
3079         vm_page_unlock_queues();                        \
3080 }
3081
3082
3083 #undef  UNLOCK_THINGS
3084 #define UNLOCK_THINGS   {                               \
3085         object->paging_in_progress--;                   \
3086         vm_object_unlock(object);                       \
3087 }
3088
3089 #undef  UNLOCK_AND_DEALLOCATE
3090 #define UNLOCK_AND_DEALLOCATE   {                       \
3091         UNLOCK_THINGS;                                  \
3092         vm_object_deallocate(object);                   \
3093 }
3094 /*
3095  *      Give up and have caller do things the hard way.
3096  */
3097
3098 #define GIVE_UP {                                       \
3099         UNLOCK_AND_DEALLOCATE;                          \
3100         return(KERN_FAILURE);                           \
3101 }
3102
3103
3104         /*
3105          *      If this entry is not directly to a vm_object, bail out.
3106          */
3107         if (entry->is_sub_map)
3108                 return(KERN_FAILURE);
3109
3110         /*
3111          *      Find the backing store object and offset into it.
3112          */
3113
3114         object = entry->object.vm_object;
3115         offset = (va - entry->vme_start) + entry->offset;
3116         prot = entry->protection;
3117
3118         /*
3119          *      Make a reference to this object to prevent its
3120          *      disposal while we are messing with it.
3121          */
3122
3123         vm_object_lock(object);
3124         assert(object->ref_count > 0);
3125         object->ref_count++;
3126         vm_object_res_reference(object);
3127         object->paging_in_progress++;
3128
3129         /*
3130          *      INVARIANTS (through entire routine):
3131          *
3132          *      1)      At all times, we must either have the object
3133          *              lock or a busy page in some object to prevent
3134          *              some other thread from trying to bring in
3135          *              the same page.
3136          *
3137          *      2)      Once we have a busy page, we must remove it from
3138          *              the pageout queues, so that the pageout daemon
3139          *              will not grab it away.
3140          *
3141          */
3142
3143         /*
3144          *      Look for page in top-level object.  If it's not there or
3145          *      there's something going on, give up.
3146          */
3147         m = vm_page_lookup(object, offset);
3148         if ((m == VM_PAGE_NULL) || (m->busy) ||
3149             (m->unusual && ( m->error || m->restart || m->absent ||
3150                                 prot & m->page_lock))) {
3151
3152                 GIVE_UP;
3153         }
3154
3155         /*
3156          *      Wire the page down now.  All bail outs beyond this
3157          *      point must unwire the page.
3158          */
3159
3160         vm_page_lock_queues();
3161         vm_page_wire(m);
3162         vm_page_unlock_queues();
3163
3164         /*
3165          *      Mark page busy for other threads.
3166          */
3167         assert(!m->busy);
3168         m->busy = TRUE;
3169         assert(!m->absent);
3170
3171         /*
3172          *      Give up if the page is being written and there's a copy object
3173          */
3174         if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
3175                 RELEASE_PAGE(m);
3176                 GIVE_UP;
3177         }
3178
3179         /*
3180          *      Put this page into the physical map.
3181          *      We have to unlock the object because pmap_enter
3182          *      may cause other faults.
3183          */
3184         if (m->no_isync) {
3185                 m->no_isync = FALSE;
3186
3187                 vm_object_unlock(object);
3188
3189                 PMAP_ENTER(pmap, va, m, prot, TRUE);
3190
3191                 /* Sync I & D caches for new mapping */
3192                 pmap_attribute(pmap,
3193                                va,
3194                                PAGE_SIZE,
3195                                MATTR_CACHE,
3196                                &mv_cache_sync);
3197
3198         } else {
3199                 vm_object_unlock(object);
3200
3201                 PMAP_ENTER(pmap, va, m, prot, TRUE);
3202         }
3203
3204         /*
3205          *      Must relock object so that paging_in_progress can be cleared.
3206          */
3207         vm_object_lock(object);
3208
3209         /*
3210          *      Unlock everything, and return
3211          */
3212
3213         PAGE_WAKEUP_DONE(m);
3214         UNLOCK_AND_DEALLOCATE;
3215
3216         return(KERN_SUCCESS);
3217
3218 }
3219
3220 /*
3221  *      Routine:        vm_fault_copy_cleanup
3222  *      Purpose:
3223  *              Release a page used by vm_fault_copy.
3224  */
3225
3226 void
3227 vm_fault_copy_cleanup(
3228         vm_page_t       page,
3229         vm_page_t       top_page)
3230 {
3231         vm_object_t     object = page->object;
3232
3233         vm_object_lock(object);
3234         PAGE_WAKEUP_DONE(page);
3235         vm_page_lock_queues();
3236         if (!page->active && !page->inactive)
3237                 vm_page_activate(page);
3238         vm_page_unlock_queues();
3239         vm_fault_cleanup(object, top_page);
3240 }
3241
3242 void
3243 vm_fault_copy_dst_cleanup(
3244         vm_page_t       page)
3245 {
3246         vm_object_t     object;
3247
3248         if (page != VM_PAGE_NULL) {
3249                 object = page->object;
3250                 vm_object_lock(object);
3251                 vm_page_lock_queues();
3252                 vm_page_unwire(page);
3253                 vm_page_unlock_queues();
3254                 vm_object_paging_end(object);
3255                 vm_object_unlock(object);
3256         }
3257 }
3258
3259 /*
3260  *      Routine:        vm_fault_copy
3261  *
3262  *      Purpose:
3263  *              Copy pages from one virtual memory object to another --
3264  *              neither the source nor destination pages need be resident.
3265  *
3266  *              Before actually copying a page, the version associated with
3267  *              the destination address map wil be verified.
3268  *
3269  *      In/out conditions:
3270  *              The caller must hold a reference, but not a lock, to
3271  *              each of the source and destination objects and to the
3272  *              destination map.
3273  *
3274  *      Results:
3275  *              Returns KERN_SUCCESS if no errors were encountered in
3276  *              reading or writing the data.  Returns KERN_INTERRUPTED if
3277  *              the operation was interrupted (only possible if the
3278  *              "interruptible" argument is asserted).  Other return values
3279  *              indicate a permanent error in copying the data.
3280  *
3281  *              The actual amount of data copied will be returned in the
3282  *              "copy_size" argument.  In the event that the destination map
3283  *              verification failed, this amount may be less than the amount
3284  *              requested.
3285  */
3286 kern_return_t
3287 vm_fault_copy(
3288         vm_object_t             src_object,
3289         vm_object_offset_t      src_offset,
3290         vm_size_t               *src_size,              /* INOUT */
3291         vm_object_t             dst_object,
3292         vm_object_offset_t      dst_offset,
3293         vm_map_t                dst_map,
3294         vm_map_version_t         *dst_version,
3295         int                     interruptible)
3296 {
3297         vm_page_t               result_page;
3298
3299         vm_page_t               src_page;
3300         vm_page_t               src_top_page;
3301         vm_prot_t               src_prot;
3302
3303         vm_page_t               dst_page;
3304         vm_page_t               dst_top_page;
3305         vm_prot_t               dst_prot;
3306
3307         vm_size_t               amount_left;
3308         vm_object_t             old_copy_object;
3309         kern_return_t           error = 0;
3310
3311         vm_size_t               part_size;
3312
3313         /*
3314          * In order not to confuse the clustered pageins, align
3315          * the different offsets on a page boundary.
3316          */
3317         vm_object_offset_t      src_lo_offset = trunc_page_64(src_offset);
3318         vm_object_offset_t      dst_lo_offset = trunc_page_64(dst_offset);
3319         vm_object_offset_t      src_hi_offset = round_page_64(src_offset + *src_size);
3320         vm_object_offset_t      dst_hi_offset = round_page_64(dst_offset + *src_size);
3321
3322 #define RETURN(x)                                       \
3323         MACRO_BEGIN                                     \
3324         *src_size -= amount_left;                       \
3325         MACRO_RETURN(x);                                \
3326         MACRO_END
3327
3328         amount_left = *src_size;
3329         do { /* while (amount_left > 0) */
3330                 /*
3331                  * There may be a deadlock if both source and destination
3332                  * pages are the same. To avoid this deadlock, the copy must
3333                  * start by getting the destination page in order to apply
3334                  * COW semantics if any.
3335                  */
3336
3337         RetryDestinationFault: ;
3338
3339                 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
3340
3341                 vm_object_lock(dst_object);
3342                 vm_object_paging_begin(dst_object);
3343
3344                 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
3345                 switch (vm_fault_page(dst_object,
3346                                       trunc_page_64(dst_offset),
3347                                       VM_PROT_WRITE|VM_PROT_READ,
3348                                       FALSE,
3349                                       interruptible,
3350                                       dst_lo_offset,
3351                                       dst_hi_offset,
3352                                       VM_BEHAVIOR_SEQUENTIAL,
3353                                       &dst_prot,
3354                                       &dst_page,
3355                                       &dst_top_page,
3356                                       (int *)0,
3357                                       &error,
3358                                       dst_map->no_zero_fill,
3359                                       FALSE, NULL, 0)) {
3360                 case VM_FAULT_SUCCESS:
3361                         break;
3362                 case VM_FAULT_RETRY:
3363                         goto RetryDestinationFault;
3364                 case VM_FAULT_MEMORY_SHORTAGE:
3365                         if (vm_page_wait(interruptible))
3366                                 goto RetryDestinationFault;
3367                         /* fall thru */
3368                 case VM_FAULT_INTERRUPTED:
3369                         RETURN(MACH_SEND_INTERRUPTED);
3370                 case VM_FAULT_FICTITIOUS_SHORTAGE:
3371                         vm_page_more_fictitious();
3372                         goto RetryDestinationFault;
3373                 case VM_FAULT_MEMORY_ERROR:
3374                         if (error)
3375                                 return (error);
3376                         else
3377                                 return(KERN_MEMORY_ERROR);
3378                 }
3379                 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
3380
3381                 old_copy_object = dst_page->object->copy;
3382
3383                 /*
3384                  * There exists the possiblity that the source and
3385                  * destination page are the same.  But we can't
3386                  * easily determine that now.  If they are the
3387                  * same, the call to vm_fault_page() for the
3388                  * destination page will deadlock.  To prevent this we
3389                  * wire the page so we can drop busy without having
3390                  * the page daemon steal the page.  We clean up the
3391                  * top page  but keep the paging reference on the object
3392                  * holding the dest page so it doesn't go away.
3393                  */
3394
3395                 vm_page_lock_queues();
3396                 vm_page_wire(dst_page);
3397                 vm_page_unlock_queues();
3398                 PAGE_WAKEUP_DONE(dst_page);
3399                 vm_object_unlock(dst_page->object);
3400
3401                 if (dst_top_page != VM_PAGE_NULL) {
3402                         vm_object_lock(dst_object);
3403                         VM_PAGE_FREE(dst_top_page);
3404                         vm_object_paging_end(dst_object);
3405                         vm_object_unlock(dst_object);
3406                 }
3407
3408         RetrySourceFault: ;
3409
3410                 if (src_object == VM_OBJECT_NULL) {
3411                         /*
3412                          *      No source object.  We will just
3413                          *      zero-fill the page in dst_object.
3414                          */
3415                         src_page = VM_PAGE_NULL;
3416                         result_page = VM_PAGE_NULL;
3417                 } else {
3418                         vm_object_lock(src_object);
3419                         src_page = vm_page_lookup(src_object,
3420                                                   trunc_page_64(src_offset));
3421                         if (src_page == dst_page) {
3422                                 src_prot = dst_prot;
3423                                 result_page = VM_PAGE_NULL;
3424                         } else {
3425                                 src_prot = VM_PROT_READ;
3426                                 vm_object_paging_begin(src_object);
3427
3428                                 XPR(XPR_VM_FAULT,
3429                                         "vm_fault_copy(2) -> vm_fault_page\n",
3430                                         0,0,0,0,0);
3431                                 switch (vm_fault_page(src_object,
3432                                                       trunc_page_64(src_offset),
3433                                                       VM_PROT_READ,
3434                                                       FALSE,
3435                                                       interruptible,
3436                                                       src_lo_offset,
3437                                                       src_hi_offset,
3438                                                       VM_BEHAVIOR_SEQUENTIAL,
3439                                                       &src_prot,
3440                                                       &result_page,
3441                                                       &src_top_page,
3442                                                       (int *)0,
3443                                                       &error,
3444                                                       FALSE,
3445                                                       FALSE, NULL, 0)) {
3446
3447                                 case VM_FAULT_SUCCESS:
3448                                         break;
3449                                 case VM_FAULT_RETRY:
3450                                         goto RetrySourceFault;
3451                                 case VM_FAULT_MEMORY_SHORTAGE:
3452                                         if (vm_page_wait(interruptible))
3453                                                 goto RetrySourceFault;
3454                                         /* fall thru */
3455                                 case VM_FAULT_INTERRUPTED:
3456                                         vm_fault_copy_dst_cleanup(dst_page);
3457                                         RETURN(MACH_SEND_INTERRUPTED);
3458                                 case VM_FAULT_FICTITIOUS_SHORTAGE:
3459                                         vm_page_more_fictitious();
3460                                         goto RetrySourceFault;
3461                                 case VM_FAULT_MEMORY_ERROR:
3462                                         vm_fault_copy_dst_cleanup(dst_page);
3463                                         if (error)
3464                                                 return (error);
3465                                         else
3466                                                 return(KERN_MEMORY_ERROR);
3467                                 }
3468
3469
3470                                 assert((src_top_page == VM_PAGE_NULL) ==
3471                                        (result_page->object == src_object));
3472                         }
3473                         assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
3474                         vm_object_unlock(result_page->object);
3475                 }
3476
3477                 if (!vm_map_verify(dst_map, dst_version)) {
3478                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
3479                                 vm_fault_copy_cleanup(result_page, src_top_page);
3480                         vm_fault_copy_dst_cleanup(dst_page);
3481                         break;
3482                 }
3483
3484                 vm_object_lock(dst_page->object);
3485
3486                 if (dst_page->object->copy != old_copy_object) {
3487                         vm_object_unlock(dst_page->object);
3488                         vm_map_verify_done(dst_map, dst_version);
3489                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
3490                                 vm_fault_copy_cleanup(result_page, src_top_page);
3491                         vm_fault_copy_dst_cleanup(dst_page);
3492                         break;
3493                 }
3494                 vm_object_unlock(dst_page->object);
3495
3496                 /*
3497                  *      Copy the page, and note that it is dirty
3498                  *      immediately.
3499                  */
3500
3501                 if (!page_aligned(src_offset) ||
3502                         !page_aligned(dst_offset) ||
3503                         !page_aligned(amount_left)) {
3504
3505                         vm_object_offset_t      src_po,
3506                                                 dst_po;
3507
3508                         src_po = src_offset - trunc_page_64(src_offset);
3509                         dst_po = dst_offset - trunc_page_64(dst_offset);
3510
3511                         if (dst_po > src_po) {
3512                                 part_size = PAGE_SIZE - dst_po;
3513                         } else {
3514                                 part_size = PAGE_SIZE - src_po;
3515                         }
3516                         if (part_size > (amount_left)){
3517                                 part_size = amount_left;
3518                         }
3519
3520                         if (result_page == VM_PAGE_NULL) {
3521                                 vm_page_part_zero_fill(dst_page,
3522                                                         dst_po, part_size);
3523                         } else {
3524                                 vm_page_part_copy(result_page, src_po,
3525                                         dst_page, dst_po, part_size);
3526                                 if(!dst_page->dirty){
3527                                         vm_object_lock(dst_object);
3528                                         dst_page->dirty = TRUE;
3529                                         vm_object_unlock(dst_page->object);
3530                                 }
3531
3532                         }
3533                 } else {
3534                         part_size = PAGE_SIZE;
3535
3536                         if (result_page == VM_PAGE_NULL)
3537                                 vm_page_zero_fill(dst_page);
3538                         else{
3539                                 vm_page_copy(result_page, dst_page);
3540                                 if(!dst_page->dirty){
3541                                         vm_object_lock(dst_object);
3542                                         dst_page->dirty = TRUE;
3543                                         vm_object_unlock(dst_page->object);
3544                                 }
3545                         }
3546
3547                 }
3548
3549                 /*
3550                  *      Unlock everything, and return
3551                  */
3552
3553                 vm_map_verify_done(dst_map, dst_version);
3554
3555                 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3556                         vm_fault_copy_cleanup(result_page, src_top_page);
3557                 vm_fault_copy_dst_cleanup(dst_page);
3558
3559                 amount_left -= part_size;
3560                 src_offset += part_size;
3561                 dst_offset += part_size;
3562         } while (amount_left > 0);
3563
3564         RETURN(KERN_SUCCESS);
3565 #undef  RETURN
3566
3567         /*NOTREACHED*/
3568 }
3569
3570 #ifdef  notdef
3571
3572 /*
3573  *      Routine:        vm_fault_page_overwrite
3574  *
3575  *      Description:
3576  *              A form of vm_fault_page that assumes that the
3577  *              resulting page will be overwritten in its entirety,
3578  *              making it unnecessary to obtain the correct *contents*
3579  *              of the page.
3580  *
3581  *      Implementation:
3582  *              XXX Untested.  Also unused.  Eventually, this technology
3583  *              could be used in vm_fault_copy() to advantage.
3584  */
3585 vm_fault_return_t
3586 vm_fault_page_overwrite(
3587         register
3588         vm_object_t             dst_object,
3589         vm_object_offset_t      dst_offset,
3590         vm_page_t               *result_page)   /* OUT */
3591 {
3592         register
3593         vm_page_t       dst_page;
3594         kern_return_t   wait_result;
3595
3596 #define interruptible   THREAD_UNINT    /* XXX */
3597
3598         while (TRUE) {
3599                 /*
3600                  *      Look for a page at this offset
3601                  */
3602
3603                 while ((dst_page = vm_page_lookup(dst_object, dst_offset))
3604                                  == VM_PAGE_NULL) {
3605                         /*
3606                          *      No page, no problem... just allocate one.
3607                          */
3608
3609                         dst_page = vm_page_alloc(dst_object, dst_offset);
3610                         if (dst_page == VM_PAGE_NULL) {
3611                                 vm_object_unlock(dst_object);
3612                                 VM_PAGE_WAIT();
3613                                 vm_object_lock(dst_object);
3614                                 continue;
3615                         }
3616
3617                         /*
3618                          *      Pretend that the memory manager
3619                          *      write-protected the page.
3620                          *
3621                          *      Note that we will be asking for write
3622                          *      permission without asking for the data
3623                          *      first.
3624                          */
3625
3626                         dst_page->overwriting = TRUE;
3627                         dst_page->page_lock = VM_PROT_WRITE;
3628                         dst_page->absent = TRUE;
3629                         dst_page->unusual = TRUE;
3630                         dst_object->absent_count++;
3631
3632                         break;
3633
3634                         /*
3635                          *      When we bail out, we might have to throw
3636                          *      away the page created here.
3637                          */
3638
3639 #define DISCARD_PAGE                                            \
3640         MACRO_BEGIN                                             \
3641         vm_object_lock(dst_object);                             \
3642         dst_page = vm_page_lookup(dst_object, dst_offset);      \
3643         if ((dst_page != VM_PAGE_NULL) && dst_page->overwriting) \
3644                 VM_PAGE_FREE(dst_page);                         \
3645         vm_object_unlock(dst_object);                           \
3646         MACRO_END
3647                 }
3648
3649                 /*
3650                  *      If the page is write-protected...
3651                  */
3652
3653                 if (dst_page->page_lock & VM_PROT_WRITE) {
3654                         /*
3655                          *      ... and an unlock request hasn't been sent
3656                          */
3657
3658                         if ( ! (dst_page->unlock_request & VM_PROT_WRITE)) {
3659                                 vm_prot_t       u;
3660                                 kern_return_t   rc;
3661
3662                                 /*
3663                                  *      ... then send one now.
3664                                  */
3665
3666                                 if (!dst_object->pager_ready) {
3667                                         vm_object_assert_wait(dst_object,
3668                                                 VM_OBJECT_EVENT_PAGER_READY,
3669                                                 interruptible);
3670                                         vm_object_unlock(dst_object);
3671                                         wait_result = thread_block((void (*)(void))0);
3672                                         if (wait_result != THREAD_AWAKENED) {
3673                                                 DISCARD_PAGE;
3674                                                 return(VM_FAULT_INTERRUPTED);
3675                                         }
3676                                         continue;
3677                                 }
3678
3679                                 u = dst_page->unlock_request |= VM_PROT_WRITE;
3680                                 vm_object_unlock(dst_object);
3681
3682                                 if ((rc = memory_object_data_unlock(
3683                                                 dst_object->pager,
3684                                                 dst_offset + dst_object->paging_offset,
3685                                                 PAGE_SIZE,
3686                                                 u)) != KERN_SUCCESS) {
3687                                         if (vm_fault_debug)
3688                                             printf("vm_object_overwrite: memory_object_data_unlock failed\n");
3689                                         DISCARD_PAGE;
3690                                         return((rc == MACH_SEND_INTERRUPTED) ?
3691                                                 VM_FAULT_INTERRUPTED :
3692                                                 VM_FAULT_MEMORY_ERROR);
3693                                 }
3694                                 vm_object_lock(dst_object);
3695                                 continue;
3696                         }
3697
3698                         /* ... fall through to wait below */
3699                 } else {
3700                         /*
3701                          *      If the page isn't being used for other
3702                          *      purposes, then we're done.
3703                          */
3704                         if ( ! (dst_page->busy || dst_page->absent ||
3705                                 dst_page->error || dst_page->restart) )
3706                                 break;
3707                 }
3708
3709                 PAGE_ASSERT_WAIT(dst_page, interruptible);
3710                 vm_object_unlock(dst_object);
3711                 wait_result = thread_block((void (*)(void))0);
3712                 if (wait_result != THREAD_AWAKENED) {
3713                         DISCARD_PAGE;
3714                         return(VM_FAULT_INTERRUPTED);
3715                 }
3716         }
3717
3718         *result_page = dst_page;
3719         return(VM_FAULT_SUCCESS);
3720
3721 #undef  interruptible
3722 #undef  DISCARD_PAGE
3723 }
3724
3725 #endif  /* notdef */
3726
3727 #if     VM_FAULT_CLASSIFY
3728 /*
3729  *      Temporary statistics gathering support.
3730  */
3731
3732 /*
3733  *      Statistics arrays:
3734  */
3735 #define VM_FAULT_TYPES_MAX      5
3736 #define VM_FAULT_LEVEL_MAX      8
3737
3738 int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
3739
3740 #define VM_FAULT_TYPE_ZERO_FILL 0
3741 #define VM_FAULT_TYPE_MAP_IN    1
3742 #define VM_FAULT_TYPE_PAGER     2
3743 #define VM_FAULT_TYPE_COPY      3
3744 #define VM_FAULT_TYPE_OTHER     4
3745
3746
3747 void
3748 vm_fault_classify(vm_object_t           object,
3749                   vm_object_offset_t    offset,
3750                   vm_prot_t             fault_type)
3751 {
3752         int             type, level = 0;
3753         vm_page_t       m;
3754
3755         while (TRUE) {
3756                 m = vm_page_lookup(object, offset);
3757                 if (m != VM_PAGE_NULL) {
3758                         if (m->busy || m->error || m->restart || m->absent ||
3759                             fault_type & m->page_lock) {
3760                                 type = VM_FAULT_TYPE_OTHER;
3761                                 break;
3762                         }
3763                         if (((fault_type & VM_PROT_WRITE) == 0) ||
3764                             ((level == 0) && object->copy == VM_OBJECT_NULL)) {
3765                                 type = VM_FAULT_TYPE_MAP_IN;
3766                                 break;
3767                         }
3768                         type = VM_FAULT_TYPE_COPY;
3769                         break;
3770                 }
3771                 else {
3772                         if (object->pager_created) {
3773                                 type = VM_FAULT_TYPE_PAGER;
3774                                 break;
3775                         }
3776                         if (object->shadow == VM_OBJECT_NULL) {
3777                                 type = VM_FAULT_TYPE_ZERO_FILL;
3778                                 break;
3779                         }
3780
3781                         offset += object->shadow_offset;
3782                         object = object->shadow;
3783                         level++;
3784                         continue;
3785                 }
3786         }
3787
3788         if (level > VM_FAULT_LEVEL_MAX)
3789                 level = VM_FAULT_LEVEL_MAX;
3790
3791         vm_fault_stats[type][level] += 1;
3792
3793         return;
3794 }
3795
3796 /* cleanup routine to call from debugger */
3797
3798 void
3799 vm_fault_classify_init(void)
3800 {
3801         int type, level;
3802
3803         for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
3804                 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
3805                         vm_fault_stats[type][level] = 0;
3806                 }
3807         }
3808
3809         return;
3810 }
3811 #endif  /* VM_FAULT_CLASSIFY */