/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
 *
 *	Page fault handling module.
 */
/* remove after component interface available */
extern int	vnode_pager_workaround;
extern int	device_pager_workaround;
#include <mach_cluster_stats.h>
#include <mach_pagemap.h>

#include <vm/vm_fault.h>
#include <mach/kern_return.h>
#include <mach/message.h>	/* for error codes */
#include <kern/host_statistics.h>
#include <kern/counters.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/host.h>
#include <ppc/proc_reg.h>
#include <ppc/pmap_internals.h>
#include <vm/task_working_set.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <mach/vm_param.h>
#include <mach/vm_behavior.h>
#include <mach/memory_object.h>
				/* For memory_object_data_{request,unlock} */
#include <kern/mach_param.h>
#include <kern/macro_help.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>

#include <sys/kdebug.h>
#define VM_FAULT_CLASSIFY	0
#define VM_FAULT_STATIC_CONFIG	1

#define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */

int		vm_object_absent_max = 50;

int		vm_fault_debug = 0;
boolean_t	vm_page_deactivate_behind = TRUE;

#if	!VM_FAULT_STATIC_CONFIG
boolean_t	vm_fault_dirty_handling = FALSE;
boolean_t	vm_fault_interruptible = FALSE;
boolean_t	software_reference_bits = TRUE;
#endif	/* !VM_FAULT_STATIC_CONFIG */

#if	MACH_KDB
extern struct db_watchpoint *db_watchpoint_list;
#endif	/* MACH_KDB */
/* Forward declarations of internal routines. */
extern kern_return_t vm_fault_wire_fast(
			vm_map_t	map,
			vm_offset_t	va,
			vm_map_entry_t	entry,
			pmap_t		pmap,
			vm_offset_t	pmap_addr);

extern void vm_fault_continue(void);

extern void vm_fault_copy_cleanup(
			vm_page_t	page,
			vm_page_t	top_page);

extern void vm_fault_copy_dst_cleanup(
			vm_page_t	page);

#if	VM_FAULT_CLASSIFY
extern void vm_fault_classify(vm_object_t	object,
			vm_object_offset_t	offset,
			vm_prot_t		fault_type);

extern void vm_fault_classify_init(void);
#endif	/* VM_FAULT_CLASSIFY */
/*
 *	Routine:	vm_fault_init
 *		Initialize our private data structures.
 */

/*
 *	Routine:	vm_fault_cleanup
 *		Clean up the result of vm_fault_page.
 *
 *		The paging reference for "object" is released.
 *		"object" is unlocked.
 *		If "top_page" is not null, "top_page" is
 *		freed and the paging reference for the object
 *		containing it is released.
 *
 *		"object" must be locked.
 */
void
vm_fault_cleanup(
	register vm_object_t	object,
	register vm_page_t	top_page)
{
	vm_object_paging_end(object);
	vm_object_unlock(object);

	if (top_page != VM_PAGE_NULL) {
		object = top_page->object;
		vm_object_lock(object);
		VM_PAGE_FREE(top_page);
		vm_object_paging_end(object);
		vm_object_unlock(object);
	}
}
#if	MACH_CLUSTER_STATS
#define MAXCLUSTERPAGES	16
struct {
	unsigned long pages_in_cluster;
	unsigned long pages_at_higher_offsets;
	unsigned long pages_at_lower_offsets;
} cluster_stats_in[MAXCLUSTERPAGES];
#define CLUSTER_STAT(clause)	clause
#define CLUSTER_STAT_HIGHER(x)	\
	((cluster_stats_in[(x)].pages_at_higher_offsets)++)
#define CLUSTER_STAT_LOWER(x)	\
	((cluster_stats_in[(x)].pages_at_lower_offsets)++)
#define CLUSTER_STAT_CLUSTER(x)	\
	((cluster_stats_in[(x)].pages_in_cluster)++)
#else	/* MACH_CLUSTER_STATS */
#define CLUSTER_STAT(clause)
#endif	/* MACH_CLUSTER_STATS */
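
/*
 * Illustrative note (not part of the original source): CLUSTER_STAT()
 * compiles its argument only when MACH_CLUSTER_STATS is configured, so
 * statistics code can stay inline at zero cost otherwise, e.g.
 *
 *	CLUSTER_STAT(pages_at_higher_offsets++;)
 *	CLUSTER_STAT_HIGHER(n);		bump the histogram bucket for a
 *					cluster page n slots above the fault
 *
 * where "n" is a hypothetical index below MAXCLUSTERPAGES.
 */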
/* XXX - temporary */
boolean_t vm_allow_clustered_pagein = FALSE;
int vm_pagein_cluster_used = 0;

/*
 *	Prepage default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 */
int vm_default_ahead = 1;	/* Number of pages to prepage ahead */
int vm_default_behind = 0;	/* Number of pages to prepage behind */

#define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
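
/*
 * Illustrative note (not part of the original source): ALIGNED() tests
 * whether a 64-bit object offset sits on a page boundary; with a 4K
 * page size, ALIGNED(0x1000) is TRUE and ALIGNED(0x1234) is FALSE.
 * vm_default_ahead/vm_default_behind bound how many extra pages are
 * requested around a fault when the map entry uses VM_BEHAVIOR_DEFAULT.
 */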
/*
 *	Routine:	vm_fault_page
 *		Find the resident page for the virtual memory
 *		specified by the given virtual memory object
 *		and offset.
 *	Additional arguments:
 *		The required permissions for the page are given
 *		in "fault_type".  Desired permissions are included
 *		in "protection".  The minimum and maximum valid offsets
 *		within the object for the relevant map entry are
 *		passed in "lo_offset" and "hi_offset" respectively and
 *		the expected page reference pattern is passed in "behavior".
 *		These three parameters are used to determine pagein cluster
 *		limits.
 *
 *		If the desired page is known to be resident (for
 *		example, because it was previously wired down), asserting
 *		the "unwiring" parameter will speed the search.
 *
 *		If the operation can be interrupted (by thread_abort
 *		or thread_terminate), then the "interruptible"
 *		parameter should be asserted.
 *
 *	Results:
 *		The page containing the proper data is returned
 *		in "result_page".
 *
 *	In/out conditions:
 *		The source object must be locked and referenced,
 *		and must donate one paging reference.  The reference
 *		is not affected.  The paging reference and lock are
 *		consumed.
 *
 *		If the call succeeds, the object in which "result_page"
 *		resides is left locked and holding a paging reference.
 *		If this is not the original object, a busy page in the
 *		original object is returned in "top_page", to prevent other
 *		callers from pursuing this same data, along with a paging
 *		reference for the original object.  The "top_page" should
 *		be destroyed when this guarantee is no longer required.
 *		The "result_page" is also left busy.  It is not removed
 *		from the pageout queues.
 */
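
/*
 * Illustrative sketch (not part of the original source): callers are
 * expected to dispatch on the vm_fault_return_t result, roughly
 *
 *	switch (vm_fault_page(...)) {
 *	case VM_FAULT_SUCCESS:		break;	*result_page is busy
 *	case VM_FAULT_RETRY:		restart the lookup
 *	case VM_FAULT_MEMORY_SHORTAGE:	wait for a free page, then retry
 *	case VM_FAULT_INTERRUPTED:	abort the fault
 *	case VM_FAULT_MEMORY_ERROR:	report *error_code
 *	}
 *
 * The recovery actions shown are a sketch; vm_fault() below contains
 * the authoritative dispatch.
 */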
vm_fault_return_t
vm_fault_page(
	vm_object_t	first_object,	/* Object to begin search */
	vm_object_offset_t first_offset,	/* Offset into object */
	vm_prot_t	fault_type,	/* What access is requested */
	boolean_t	must_be_resident,/* Must page be resident? */
	int		interruptible,	/* how may fault be interrupted? */
	vm_object_offset_t lo_offset,	/* Map entry start */
	vm_object_offset_t hi_offset,	/* Map entry end */
	vm_behavior_t	behavior,	/* Page reference behavior */
	/* Modifies in place: */
	vm_prot_t	*protection,	/* Protection for mapping */
	vm_page_t	*result_page,	/* Page found, if successful */
	vm_page_t	*top_page,	/* Page in top object, if
					 * not result_page. */
	int		*type_of_fault, /* if non-null, fill in with type of fault
					 * COW, zero-fill, etc... returned in trace point */
	/* More arguments: */
	kern_return_t	*error_code,	/* code if page is in error */
	boolean_t	no_zero_fill,	/* don't zero fill absent pages */
	boolean_t	data_supply,	/* treat as data_supply if
					 * it is a write fault and a full
					 * page is provided */
	vm_object_offset_t	offset;
	vm_object_t		next_object;
	vm_object_t		copy_object;
	boolean_t		look_for_page;
	vm_prot_t		access_required = fault_type;
	vm_prot_t		wants_copy_flag;
	vm_size_t		cluster_size, length;
	vm_object_offset_t	cluster_offset;
	vm_object_offset_t	cluster_start, cluster_end, paging_offset;
	vm_object_offset_t	align_offset;
	CLUSTER_STAT(int pages_at_higher_offsets;)
	CLUSTER_STAT(int pages_at_lower_offsets;)
	kern_return_t		wait_result;
	boolean_t		interruptible_state;
	boolean_t		bumped_pagein = FALSE;
#if	MACH_PAGEMAP
/*
 * MACH page map - an optional optimization where a bit map is maintained
 * by the VM subsystem for internal objects to indicate which pages of
 * the object currently reside on backing store.  This existence map
 * duplicates information maintained by the vnode pager.  It is
 * created at the time of the first pageout against the object, i.e.
 * at the same time the pager for the object is created.  The optimization
 * is designed to eliminate pager interaction overhead, if it is
 * 'known' that the page does not exist on backing store.
 *
 * LOOK_FOR() evaluates to TRUE if the page specified by object/offset is
 * either marked as paged out in the existence map for the object or no
 * existence map exists for the object.  LOOK_FOR() is one of the
 * criteria in the decision to invoke the pager.  It is also used as one
 * of the criteria to terminate the scan for adjacent pages in a clustered
 * pagein operation.  Note that LOOK_FOR() always evaluates to TRUE for
 * permanent objects.  Note also that if the pager for an internal object
 * has not been created, the pager is not invoked regardless of the value
 * of LOOK_FOR() and that clustered pagein scans are only done on an object
 * for which a pager has been created.
 *
 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 * is marked as paged out in the existence map for the object.  PAGED_OUT()
 * is used to determine if a page has already been pushed
 * into a copy object in order to avoid a redundant page out operation.
 */
#define LOOK_FOR(o, f) (vm_external_state_get((o)->existence_map, (f)) \
			!= VM_EXTERNAL_STATE_ABSENT)
#define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
			== VM_EXTERNAL_STATE_EXISTS)
#else /* MACH_PAGEMAP */
/*
 * If the MACH page map optimization is not enabled,
 * LOOK_FOR() always evaluates to TRUE.  The pager will always be
 * invoked to resolve missing pages in an object, assuming the pager
 * has been created for the object.  In a clustered page operation, the
 * absence of a page on backing store cannot be used to terminate
 * a scan for adjacent pages since that information is available only in
 * the pager.  Hence pages that may not be paged out are potentially
 * included in a clustered request.  The vnode pager is coded to deal
 * with any combination of absent/present pages in a clustered
 * pagein request.  PAGED_OUT() always evaluates to FALSE, i.e. the pager
 * will always be invoked to push a dirty page into a copy object assuming
 * a pager has been created.  If the page has already been pushed, the
 * pager will ignore the new request.
 */
#define LOOK_FOR(o, f) TRUE
#define PAGED_OUT(o, f) FALSE
#endif /* MACH_PAGEMAP */
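
/*
 * Illustrative sketch (not part of the original source): the fault loop
 * below combines LOOK_FOR() with other object state when deciding
 * whether to ask the pager for the page, roughly
 *
 *	look_for_page = (object->pager_created && LOOK_FOR(object, offset));
 *	if (look_for_page && !must_be_resident)
 *		... issue memory_object_data_request() ...
 *
 * while PAGED_OUT() is consulted before pushing a page into a copy
 * object, so that a push the pager has already seen can be skipped.
 */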
#define	PREPARE_RELEASE_PAGE(m)				\
	MACRO_BEGIN					\
	vm_page_lock_queues();				\
	MACRO_END

#define	DO_RELEASE_PAGE(m)				\
	MACRO_BEGIN					\
	PAGE_WAKEUP_DONE(m);				\
	if (!m->active && !m->inactive)			\
		vm_page_activate(m);			\
	vm_page_unlock_queues();			\
	MACRO_END

#define	RELEASE_PAGE(m)					\
	MACRO_BEGIN					\
	PREPARE_RELEASE_PAGE(m);			\
	DO_RELEASE_PAGE(m);				\
	MACRO_END
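
/*
 * Illustrative note (not part of the original source): RELEASE_PAGE(m)
 * is the error-path idiom used below to give back a page this routine
 * marked busy: it takes the page queues lock, wakes any waiters via
 * PAGE_WAKEUP_DONE(), reactivates the page if it is on neither the
 * active nor the inactive queue, and drops the queues lock again.
 */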
	dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset);	/* (TEST/DEBUG) */
#if	!VM_FAULT_STATIC_CONFIG
	if (vm_fault_dirty_handling
#if	MACH_KDB
		/*
		 *	If there are watchpoints set, then
		 *	we don't want to give away write permission
		 *	on a read fault.  Make the task write fault,
		 *	so that the watchpoint code notices the access.
		 */
	    || db_watchpoint_list
#endif	/* MACH_KDB */
		) {
		/*
		 *	If we aren't asking for write permission,
		 *	then don't give it away.  We're using write
		 *	faults to set the dirty bit.
		 */
		if (!(fault_type & VM_PROT_WRITE))
			*protection &= ~VM_PROT_WRITE;
	}

	if (!vm_fault_interruptible)
		interruptible = THREAD_UNINT;
#else	/* STATIC_CONFIG */
#if	MACH_KDB
		/*
		 *	If there are watchpoints set, then
		 *	we don't want to give away write permission
		 *	on a read fault.  Make the task write fault,
		 *	so that the watchpoint code notices the access.
		 */
	if (db_watchpoint_list) {
		/*
		 *	If we aren't asking for write permission,
		 *	then don't give it away.  We're using write
		 *	faults to set the dirty bit.
		 */
		if (!(fault_type & VM_PROT_WRITE))
			*protection &= ~VM_PROT_WRITE;
	}
#endif	/* MACH_KDB */
#endif	/* STATIC_CONFIG */
	interruptible_state = thread_interrupt_level(interruptible);
	/*
	 *	INVARIANTS (through entire routine):
	 *
	 *	1)	At all times, we must either have the object
	 *		lock or a busy page in some object to prevent
	 *		some other thread from trying to bring in
	 *		the same page.
	 *
	 *		Note that we cannot hold any locks during the
	 *		pager access or when waiting for memory, so
	 *		we use a busy page then.
	 *
	 *		Note also that we aren't as concerned about more than
	 *		one thread attempting to memory_object_data_unlock
	 *		the same page at once, so we don't hold the page
	 *		as busy then, but do record the highest unlock
	 *		value so far.  [Unlock requests may also be delivered
	 *		out of order.]
	 *
	 *	2)	To prevent another thread from racing us down the
	 *		shadow chain and entering a new page in the top
	 *		object before we do, we must keep a busy page in
	 *		the top object while following the shadow chain.
	 *
	 *	3)	We must increment paging_in_progress on any object
	 *		for which we have a busy page
	 *
	 *	4)	We leave busy pages on the pageout queues.
	 *		If the pageout daemon comes across a busy page,
	 *		it will remove the page from the pageout queues.
	 */
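
	/*
	 * Illustrative note (not part of the original source): the "back
	 * off" pattern repeated below follows invariant 1.  Before
	 * blocking, the code takes an extra reference on the object, calls
	 * vm_fault_cleanup() to drop the paging reference and lock,
	 * blocks, then deallocates the extra reference and returns
	 * VM_FAULT_RETRY so the caller restarts the lookup from the top.
	 */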
	/*
	 *	Search for the page at object/offset.
	 */

	object = first_object;
	offset = first_offset;
	first_m = VM_PAGE_NULL;
	access_required = fault_type;

	XPR(XPR_VM_FAULT,
		"vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
		(integer_t)object, offset, fault_type, *protection, 0);
482 * See whether this page is resident
487 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
489 if (!object
->alive
) {
490 vm_fault_cleanup(object
, first_m
);
491 thread_interrupt_level(interruptible_state
);
492 return(VM_FAULT_MEMORY_ERROR
);
494 m
= vm_page_lookup(object
, offset
);
496 dbgTrace(0xBEEF0004, (unsigned int) m
, (unsigned int) object
); /* (TEST/DEBUG) */
498 if (m
!= VM_PAGE_NULL
) {
500 * If the page was pre-paged as part of a
501 * cluster, record the fact.
504 vm_pagein_cluster_used
++;
505 m
->clustered
= FALSE
;
509 * If the page is being brought in,
510 * wait for it and then retry.
512 * A possible optimization: if the page
513 * is known to be resident, we can ignore
514 * pages that are absent (regardless of
515 * whether they're busy).
520 dbgTrace(0xBEEF0005, (unsigned int) m
, (unsigned int) 0); /* (TEST/DEBUG) */
522 wait_result
= PAGE_SLEEP(object
, m
, interruptible
);
524 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
525 (integer_t
)object
, offset
,
527 counter(c_vm_fault_page_block_busy_kernel
++);
529 if (wait_result
!= THREAD_AWAKENED
) {
530 vm_fault_cleanup(object
, first_m
);
531 thread_interrupt_level(interruptible_state
);
532 if (wait_result
== THREAD_RESTART
)
534 return(VM_FAULT_RETRY
);
538 return(VM_FAULT_INTERRUPTED
);
545 * If the page is in error, give up now.
550 dbgTrace(0xBEEF0006, (unsigned int) m
, (unsigned int) error_code
); /* (TEST/DEBUG) */
553 *error_code
= m
->page_error
;
555 vm_fault_cleanup(object
, first_m
);
556 thread_interrupt_level(interruptible_state
);
557 return(VM_FAULT_MEMORY_ERROR
);
561 * If the pager wants us to restart
562 * at the top of the chain,
563 * typically because it has moved the
564 * page to another pager, then do so.
569 dbgTrace(0xBEEF0007, (unsigned int) m
, (unsigned int) 0); /* (TEST/DEBUG) */
572 vm_fault_cleanup(object
, first_m
);
573 thread_interrupt_level(interruptible_state
);
574 return(VM_FAULT_RETRY
);
578 * If the page isn't busy, but is absent,
579 * then it was deemed "unavailable".
584 * Remove the non-existent page (unless it's
585 * in the top object) and move on down to the
586 * next object (if there is one).
589 dbgTrace(0xBEEF0008, (unsigned int) m
, (unsigned int) object
->shadow
); /* (TEST/DEBUG) */
592 next_object
= object
->shadow
;
593 if (next_object
== VM_OBJECT_NULL
) {
596 assert(!must_be_resident
);
598 if (object
->shadow_severed
) {
601 thread_interrupt_level(interruptible_state
);
602 return VM_FAULT_MEMORY_ERROR
;
606 * Absent page at bottom of shadow
607 * chain; zero fill the page we left
608 * busy in the first object, and flush
609 * the absent page. But first we
610 * need to allocate a real page.
612 if (VM_PAGE_THROTTLED() ||
613 (real_m
= vm_page_grab()) == VM_PAGE_NULL
) {
614 vm_fault_cleanup(object
, first_m
);
615 thread_interrupt_level(interruptible_state
);
616 return(VM_FAULT_MEMORY_SHORTAGE
);
620 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
621 (integer_t
)object
, offset
,
623 (integer_t
)first_object
, 0);
624 if (object
!= first_object
) {
626 vm_object_paging_end(object
);
627 vm_object_unlock(object
);
628 object
= first_object
;
629 offset
= first_offset
;
631 first_m
= VM_PAGE_NULL
;
632 vm_object_lock(object
);
636 assert(real_m
->busy
);
637 vm_page_insert(real_m
, object
, offset
);
641 * Drop the lock while zero filling
642 * page. Then break because this
643 * is the page we wanted. Checking
644 * the page lock is a waste of time;
645 * this page was either absent or
646 * newly allocated -- in both cases
647 * it can't be page locked by a pager.
652 vm_object_unlock(object
);
653 vm_page_zero_fill(m
);
655 *type_of_fault
= DBG_ZERO_FILL_FAULT
;
656 VM_STAT(zero_fill_count
++);
658 if (bumped_pagein
== TRUE
) {
660 current_task()->pageins
--;
662 vm_object_lock(object
);
664 pmap_clear_modify(m
->phys_addr
);
665 vm_page_lock_queues();
666 VM_PAGE_QUEUES_REMOVE(m
);
667 m
->page_ticket
= vm_page_ticket
;
668 if(m
->object
->size
> 0x80000) {
670 /* depends on the queues lock */
672 queue_enter(&vm_page_queue_zf
,
673 m
, vm_page_t
, pageq
);
676 &vm_page_queue_inactive
,
677 m
, vm_page_t
, pageq
);
679 vm_page_ticket_roll
++;
680 if(vm_page_ticket_roll
==
681 VM_PAGE_TICKETS_IN_ROLL
) {
682 vm_page_ticket_roll
= 0;
684 VM_PAGE_TICKET_ROLL_IDS
)
690 vm_page_inactive_count
++;
691 vm_page_unlock_queues();
694 if (must_be_resident
) {
695 vm_object_paging_end(object
);
696 } else if (object
!= first_object
) {
697 vm_object_paging_end(object
);
703 vm_object_absent_release(object
);
706 vm_page_lock_queues();
707 VM_PAGE_QUEUES_REMOVE(m
);
708 vm_page_unlock_queues();
711 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
712 (integer_t
)object
, offset
,
713 (integer_t
)next_object
,
714 offset
+object
->shadow_offset
,0);
715 offset
+= object
->shadow_offset
;
716 hi_offset
+= object
->shadow_offset
;
717 lo_offset
+= object
->shadow_offset
;
718 access_required
= VM_PROT_READ
;
719 vm_object_lock(next_object
);
720 vm_object_unlock(object
);
721 object
= next_object
;
722 vm_object_paging_begin(object
);
728 && ((object
!= first_object
) ||
729 (object
->copy
!= VM_OBJECT_NULL
))
730 && (fault_type
& VM_PROT_WRITE
)) {
732 * This is a copy-on-write fault that will
733 * cause us to revoke access to this page, but
734 * this page is in the process of being cleaned
735 * in a clustered pageout. We must wait until
736 * the cleaning operation completes before
737 * revoking access to the original page,
738 * otherwise we might attempt to remove a
742 dbgTrace(0xBEEF0009, (unsigned int) m
, (unsigned int) offset
); /* (TEST/DEBUG) */
745 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
746 (integer_t
)object
, offset
,
748 /* take an extra ref so that object won't die */
749 assert(object
->ref_count
> 0);
751 vm_object_res_reference(object
);
752 vm_fault_cleanup(object
, first_m
);
753 counter(c_vm_fault_page_block_backoff_kernel
++);
754 vm_object_lock(object
);
755 assert(object
->ref_count
> 0);
756 m
= vm_page_lookup(object
, offset
);
757 if (m
!= VM_PAGE_NULL
&& m
->cleaning
) {
758 PAGE_ASSERT_WAIT(m
, interruptible
);
759 vm_object_unlock(object
);
760 wait_result
= thread_block(THREAD_CONTINUE_NULL
);
761 vm_object_deallocate(object
);
764 vm_object_unlock(object
);
765 vm_object_deallocate(object
);
766 thread_interrupt_level(interruptible_state
);
767 return VM_FAULT_RETRY
;
772 * If the desired access to this page has
773 * been locked out, request that it be unlocked.
776 if (access_required
& m
->page_lock
) {
777 if ((access_required
& m
->unlock_request
) != access_required
) {
778 vm_prot_t new_unlock_request
;
782 dbgTrace(0xBEEF000A, (unsigned int) m
, (unsigned int) object
->pager_ready
); /* (TEST/DEBUG) */
784 if (!object
->pager_ready
) {
786 "vm_f_page: ready wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
788 (integer_t
)object
, offset
,
790 /* take an extra ref */
791 assert(object
->ref_count
> 0);
793 vm_object_res_reference(object
);
794 vm_fault_cleanup(object
,
796 counter(c_vm_fault_page_block_backoff_kernel
++);
797 vm_object_lock(object
);
798 assert(object
->ref_count
> 0);
799 if (!object
->pager_ready
) {
800 wait_result
= vm_object_assert_wait(
802 VM_OBJECT_EVENT_PAGER_READY
,
804 vm_object_unlock(object
);
805 if (wait_result
== THREAD_WAITING
)
806 wait_result
= thread_block(THREAD_CONTINUE_NULL
);
807 vm_object_deallocate(object
);
810 vm_object_unlock(object
);
811 vm_object_deallocate(object
);
812 thread_interrupt_level(interruptible_state
);
813 return VM_FAULT_RETRY
;
817 new_unlock_request
= m
->unlock_request
=
818 (access_required
| m
->unlock_request
);
819 vm_object_unlock(object
);
821 "vm_f_page: unlock obj 0x%X, offset 0x%X, page 0x%X, unl_req %d\n",
822 (integer_t
)object
, offset
,
823 (integer_t
)m
, new_unlock_request
, 0);
824 if ((rc
= memory_object_data_unlock(
826 offset
+ object
->paging_offset
,
831 printf("vm_fault: memory_object_data_unlock failed\n");
832 vm_object_lock(object
);
833 vm_fault_cleanup(object
, first_m
);
834 thread_interrupt_level(interruptible_state
);
835 return((rc
== MACH_SEND_INTERRUPTED
) ?
836 VM_FAULT_INTERRUPTED
:
837 VM_FAULT_MEMORY_ERROR
);
839 vm_object_lock(object
);
844 "vm_f_page: access wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
845 access_required
, (integer_t
)object
,
846 offset
, (integer_t
)m
, 0);
847 /* take an extra ref so object won't die */
848 assert(object
->ref_count
> 0);
850 vm_object_res_reference(object
);
851 vm_fault_cleanup(object
, first_m
);
852 counter(c_vm_fault_page_block_backoff_kernel
++);
853 vm_object_lock(object
);
854 assert(object
->ref_count
> 0);
855 m
= vm_page_lookup(object
, offset
);
856 if (m
!= VM_PAGE_NULL
&&
857 (access_required
& m
->page_lock
) &&
858 !((access_required
& m
->unlock_request
) != access_required
)) {
859 PAGE_ASSERT_WAIT(m
, interruptible
);
860 vm_object_unlock(object
);
861 wait_result
= thread_block(THREAD_CONTINUE_NULL
);
862 vm_object_deallocate(object
);
865 vm_object_unlock(object
);
866 vm_object_deallocate(object
);
867 thread_interrupt_level(interruptible_state
);
868 return VM_FAULT_RETRY
;
	 *	We mark the page busy and leave it on
	 *	the pageout queues.  If the pageout
	 *	daemon comes across it, then it will
879 dbgTrace(0xBEEF000B, (unsigned int) m
, (unsigned int) 0); /* (TEST/DEBUG) */
882 #if !VM_FAULT_STATIC_CONFIG
883 if (!software_reference_bits
) {
884 vm_page_lock_queues();
886 vm_stat
.reactivations
++;
888 VM_PAGE_QUEUES_REMOVE(m
);
889 vm_page_unlock_queues();
893 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
894 (integer_t
)object
, offset
, (integer_t
)m
, 0, 0);
902 (object
->pager_created
) &&
903 LOOK_FOR(object
, offset
) &&
907 dbgTrace(0xBEEF000C, (unsigned int) look_for_page
, (unsigned int) object
); /* (TEST/DEBUG) */
909 if ((look_for_page
|| (object
== first_object
))
911 && !(object
->phys_contiguous
)) {
913 * Allocate a new page for this object/offset
917 m
= vm_page_grab_fictitious();
919 dbgTrace(0xBEEF000D, (unsigned int) m
, (unsigned int) object
); /* (TEST/DEBUG) */
921 if (m
== VM_PAGE_NULL
) {
922 vm_fault_cleanup(object
, first_m
);
923 thread_interrupt_level(interruptible_state
);
924 return(VM_FAULT_FICTITIOUS_SHORTAGE
);
926 vm_page_insert(m
, object
, offset
);
929 if ((look_for_page
&& !must_be_resident
)) {
933 * If the memory manager is not ready, we
934 * cannot make requests.
936 if (!object
->pager_ready
) {
938 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
940 if(m
!= VM_PAGE_NULL
)
943 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
944 (integer_t
)object
, offset
, 0, 0, 0);
945 /* take an extra ref so object won't die */
946 assert(object
->ref_count
> 0);
948 vm_object_res_reference(object
);
949 vm_fault_cleanup(object
, first_m
);
950 counter(c_vm_fault_page_block_backoff_kernel
++);
951 vm_object_lock(object
);
952 assert(object
->ref_count
> 0);
953 if (!object
->pager_ready
) {
954 wait_result
= vm_object_assert_wait(object
,
955 VM_OBJECT_EVENT_PAGER_READY
,
957 vm_object_unlock(object
);
958 if (wait_result
== THREAD_WAITING
)
959 wait_result
= thread_block(THREAD_CONTINUE_NULL
);
960 vm_object_deallocate(object
);
963 vm_object_unlock(object
);
964 vm_object_deallocate(object
);
965 thread_interrupt_level(interruptible_state
);
966 return VM_FAULT_RETRY
;
970 if(object
->phys_contiguous
) {
971 if(m
!= VM_PAGE_NULL
) {
977 if (object
->internal
) {
979 * Requests to the default pager
980 * must reserve a real page in advance,
981 * because the pager's data-provided
982 * won't block for pages. IMPORTANT:
983 * this acts as a throttling mechanism
984 * for data_requests to the default
989 dbgTrace(0xBEEF000F, (unsigned int) m
, (unsigned int) 0); /* (TEST/DEBUG) */
991 if (m
->fictitious
&& !vm_page_convert(m
)) {
993 vm_fault_cleanup(object
, first_m
);
994 thread_interrupt_level(interruptible_state
);
995 return(VM_FAULT_MEMORY_SHORTAGE
);
997 } else if (object
->absent_count
>
998 vm_object_absent_max
) {
1000 * If there are too many outstanding page
1001 * requests pending on this object, we
1002 * wait for them to be resolved now.
1006 dbgTrace(0xBEEF0010, (unsigned int) m
, (unsigned int) 0); /* (TEST/DEBUG) */
1008 if(m
!= VM_PAGE_NULL
)
1010 /* take an extra ref so object won't die */
1011 assert(object
->ref_count
> 0);
1012 object
->ref_count
++;
1013 vm_object_res_reference(object
);
1014 vm_fault_cleanup(object
, first_m
);
1015 counter(c_vm_fault_page_block_backoff_kernel
++);
1016 vm_object_lock(object
);
1017 assert(object
->ref_count
> 0);
1018 if (object
->absent_count
> vm_object_absent_max
) {
1019 vm_object_absent_assert_wait(object
,
1021 vm_object_unlock(object
);
1022 wait_result
= thread_block(THREAD_CONTINUE_NULL
);
1023 vm_object_deallocate(object
);
1026 vm_object_unlock(object
);
1027 vm_object_deallocate(object
);
1028 thread_interrupt_level(interruptible_state
);
1029 return VM_FAULT_RETRY
;
1034 * Indicate that the page is waiting for data
1035 * from the memory manager.
1038 if(m
!= VM_PAGE_NULL
) {
1040 m
->list_req_pending
= TRUE
;
1043 object
->absent_count
++;
1048 cluster_start
= offset
;
1052 * lengthen the cluster by the pages in the working set
1055 (current_task()->dynamic_working_set
!= 0)) {
1056 cluster_end
= cluster_start
+ length
;
			/* tws values for start and end are just
			 * suggestions.  Therefore, as long as
1059 * build_cluster does not use pointers or
1060 * take action based on values that
1061 * could be affected by re-entrance we
1062 * do not need to take the map lock.
1064 cluster_end
= offset
+ PAGE_SIZE_64
;
1065 tws_build_cluster((tws_hash_t
)
1066 current_task()->dynamic_working_set
,
1067 object
, &cluster_start
,
1068 &cluster_end
, 0x40000);
1069 length
= cluster_end
- cluster_start
;
1072 dbgTrace(0xBEEF0012, (unsigned int) object
, (unsigned int) 0); /* (TEST/DEBUG) */
1075 * We have a busy page, so we can
1076 * release the object lock.
1078 vm_object_unlock(object
);
1081 * Call the memory manager to retrieve the data.
1085 *type_of_fault
= (length
<< 8) | DBG_PAGEIN_FAULT
;
1087 current_task()->pageins
++;
1088 bumped_pagein
= TRUE
;
1091 * If this object uses a copy_call strategy,
1092 * and we are interested in a copy of this object
1093 * (having gotten here only by following a
1094 * shadow chain), then tell the memory manager
1095 * via a flag added to the desired_access
1096 * parameter, so that it can detect a race
1097 * between our walking down the shadow chain
1098 * and its pushing pages up into a copy of
1099 * the object that it manages.
1102 if (object
->copy_strategy
== MEMORY_OBJECT_COPY_CALL
&&
1103 object
!= first_object
) {
1104 wants_copy_flag
= VM_PROT_WANTS_COPY
;
1106 wants_copy_flag
= VM_PROT_NONE
;
1110 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1111 (integer_t
)object
, offset
, (integer_t
)m
,
1112 access_required
| wants_copy_flag
, 0);
1114 rc
= memory_object_data_request(object
->pager
,
1115 cluster_start
+ object
->paging_offset
,
1117 access_required
| wants_copy_flag
);
1121 dbgTrace(0xBEEF0013, (unsigned int) object
, (unsigned int) rc
); /* (TEST/DEBUG) */
1123 if (rc
!= KERN_SUCCESS
) {
1124 if (rc
!= MACH_SEND_INTERRUPTED
1126 printf("%s(0x%x, 0x%x, 0x%x, 0x%x) failed, rc=%d\n",
1127 "memory_object_data_request",
1129 cluster_start
+ object
->paging_offset
,
1130 length
, access_required
, rc
);
1132 * Don't want to leave a busy page around,
1133 * but the data request may have blocked,
1134 * so check if it's still there and busy.
1136 if(!object
->phys_contiguous
) {
1137 vm_object_lock(object
);
1138 for (; length
; length
-= PAGE_SIZE
,
1139 cluster_start
+= PAGE_SIZE_64
) {
1141 if ((p
= vm_page_lookup(object
,
1143 && p
->absent
&& p
->busy
1149 vm_fault_cleanup(object
, first_m
);
1150 thread_interrupt_level(interruptible_state
);
1151 return((rc
== MACH_SEND_INTERRUPTED
) ?
1152 VM_FAULT_INTERRUPTED
:
1153 VM_FAULT_MEMORY_ERROR
);
1156 tws_hash_line_t line
;
1159 task
= current_task();
1162 (task
->dynamic_working_set
!= 0))
1163 && !(object
->private)) {
1164 vm_object_t base_object
;
1165 vm_object_offset_t base_offset
;
1166 base_object
= object
;
1167 base_offset
= offset
;
1168 while(base_object
->shadow
) {
1170 base_object
->shadow_offset
;
1172 base_object
->shadow
;
1176 task
->dynamic_working_set
,
1177 base_offset
, base_object
,
1178 &line
) == KERN_SUCCESS
) {
1179 tws_line_signal((tws_hash_t
)
1180 task
->dynamic_working_set
,
1188 * Retry with same object/offset, since new data may
1189 * be in a different page (i.e., m is meaningless at
1192 vm_object_lock(object
);
1193 if ((interruptible
!= THREAD_UNINT
) &&
1194 (current_thread()->state
& TH_ABORT
)) {
1195 vm_fault_cleanup(object
, first_m
);
1196 thread_interrupt_level(interruptible_state
);
1197 return(VM_FAULT_INTERRUPTED
);
1199 if(m
== VM_PAGE_NULL
)
1205 * The only case in which we get here is if
1206 * object has no pager (or unwiring). If the pager doesn't
1207 * have the page this is handled in the m->absent case above
1208 * (and if you change things here you should look above).
1211 dbgTrace(0xBEEF0014, (unsigned int) object
, (unsigned int) m
); /* (TEST/DEBUG) */
1213 if (object
== first_object
)
1216 assert(m
== VM_PAGE_NULL
);
1219 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1220 (integer_t
)object
, offset
, (integer_t
)m
,
1221 (integer_t
)object
->shadow
, 0);
1223 * Move on to the next object. Lock the next
1224 * object before unlocking the current one.
1226 next_object
= object
->shadow
;
1227 if (next_object
== VM_OBJECT_NULL
) {
1228 assert(!must_be_resident
);
1230 * If there's no object left, fill the page
1231 * in the top object with zeros. But first we
1232 * need to allocate a real page.
1235 if (object
!= first_object
) {
1236 vm_object_paging_end(object
);
1237 vm_object_unlock(object
);
1239 object
= first_object
;
1240 offset
= first_offset
;
1241 vm_object_lock(object
);
1245 assert(m
->object
== object
);
1246 first_m
= VM_PAGE_NULL
;
1248 if (object
->shadow_severed
) {
1250 vm_fault_cleanup(object
, VM_PAGE_NULL
);
1251 thread_interrupt_level(interruptible_state
);
1252 return VM_FAULT_MEMORY_ERROR
;
1255 if (VM_PAGE_THROTTLED() ||
1256 (m
->fictitious
&& !vm_page_convert(m
))) {
1258 vm_fault_cleanup(object
, VM_PAGE_NULL
);
1259 thread_interrupt_level(interruptible_state
);
1260 return(VM_FAULT_MEMORY_SHORTAGE
);
1262 m
->no_isync
= FALSE
;
1264 if (!no_zero_fill
) {
1265 vm_object_unlock(object
);
1266 vm_page_zero_fill(m
);
1268 *type_of_fault
= DBG_ZERO_FILL_FAULT
;
1269 VM_STAT(zero_fill_count
++);
1271 if (bumped_pagein
== TRUE
) {
1273 current_task()->pageins
--;
1275 vm_object_lock(object
);
1277 vm_page_lock_queues();
1278 VM_PAGE_QUEUES_REMOVE(m
);
1279 if(m
->object
->size
> 0x80000) {
1280 m
->zero_fill
= TRUE
;
1281 /* depends on the queues lock */
1283 queue_enter(&vm_page_queue_zf
,
1284 m
, vm_page_t
, pageq
);
1287 &vm_page_queue_inactive
,
1288 m
, vm_page_t
, pageq
);
1290 m
->page_ticket
= vm_page_ticket
;
1291 vm_page_ticket_roll
++;
1292 if(vm_page_ticket_roll
== VM_PAGE_TICKETS_IN_ROLL
) {
1293 vm_page_ticket_roll
= 0;
1294 if(vm_page_ticket
==
1295 VM_PAGE_TICKET_ROLL_IDS
)
1301 vm_page_inactive_count
++;
1302 vm_page_unlock_queues();
1303 pmap_clear_modify(m
->phys_addr
);
1307 if ((object
!= first_object
) || must_be_resident
)
1308 vm_object_paging_end(object
);
1309 offset
+= object
->shadow_offset
;
1310 hi_offset
+= object
->shadow_offset
;
1311 lo_offset
+= object
->shadow_offset
;
1312 access_required
= VM_PROT_READ
;
1313 vm_object_lock(next_object
);
1314 vm_object_unlock(object
);
1315 object
= next_object
;
1316 vm_object_paging_begin(object
);
1321 * PAGE HAS BEEN FOUND.
1324 * busy, so that we can play with it;
1325 * not absent, so that nobody else will fill it;
1326 * possibly eligible for pageout;
1328 * The top-level page (first_m) is:
1329 * VM_PAGE_NULL if the page was found in the
1331 * busy, not absent, and ineligible for pageout.
1333 * The current object (object) is locked. A paging
1334 * reference is held for the current and top-level
1339 dbgTrace(0xBEEF0015, (unsigned int) object
, (unsigned int) m
); /* (TEST/DEBUG) */
1341 #if EXTRA_ASSERTIONS
1342 if(m
!= VM_PAGE_NULL
) {
1343 assert(m
->busy
&& !m
->absent
);
1344 assert((first_m
== VM_PAGE_NULL
) ||
1345 (first_m
->busy
&& !first_m
->absent
&&
1346 !first_m
->active
&& !first_m
->inactive
));
1348 #endif /* EXTRA_ASSERTIONS */
1351 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1352 (integer_t
)object
, offset
, (integer_t
)m
,
1353 (integer_t
)first_object
, (integer_t
)first_m
);
1355 * If the page is being written, but isn't
1356 * already owned by the top-level object,
1357 * we have to copy it into a new page owned
1358 * by the top-level object.
1361 if ((object
!= first_object
) && (m
!= VM_PAGE_NULL
)) {
1363 * We only really need to copy if we
1368 dbgTrace(0xBEEF0016, (unsigned int) object
, (unsigned int) fault_type
); /* (TEST/DEBUG) */
1370 if (fault_type
& VM_PROT_WRITE
) {
1373 assert(!must_be_resident
);
1376 * If we try to collapse first_object at this
1377 * point, we may deadlock when we try to get
1378 * the lock on an intermediate object (since we
1379 * have the bottom object locked). We can't
1380 * unlock the bottom object, because the page
1381 * we found may move (by collapse) if we do.
1383 * Instead, we first copy the page. Then, when
1384 * we have no more use for the bottom object,
1385 * we unlock it and try to collapse.
1387 * Note that we copy the page even if we didn't
1388 * need to... that's the breaks.
1392 * Allocate a page for the copy
1394 copy_m
= vm_page_grab();
1395 if (copy_m
== VM_PAGE_NULL
) {
1397 vm_fault_cleanup(object
, first_m
);
1398 thread_interrupt_level(interruptible_state
);
1399 return(VM_FAULT_MEMORY_SHORTAGE
);
1404 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1405 (integer_t
)object
, offset
,
1406 (integer_t
)m
, (integer_t
)copy_m
, 0);
1407 vm_page_copy(m
, copy_m
);
1410 * If another map is truly sharing this
1411 * page with us, we have to flush all
1412 * uses of the original page, since we
1413 * can't distinguish those which want the
1414 * original from those which need the
1417 * XXXO If we know that only one map has
1418 * access to this page, then we could
1419 * avoid the pmap_page_protect() call.
1422 vm_page_lock_queues();
1423 assert(!m
->cleaning
);
1424 pmap_page_protect(m
->phys_addr
, VM_PROT_NONE
);
1425 vm_page_deactivate(m
);
1426 copy_m
->dirty
= TRUE
;
1428 * Setting reference here prevents this fault from
1429 * being counted as a (per-thread) reactivate as well
1430 * as a copy-on-write.
1432 first_m
->reference
= TRUE
;
1433 vm_page_unlock_queues();
1436 * We no longer need the old page or object.
1439 PAGE_WAKEUP_DONE(m
);
1440 vm_object_paging_end(object
);
1441 vm_object_unlock(object
);
1444 *type_of_fault
= DBG_COW_FAULT
;
1445 VM_STAT(cow_faults
++);
1446 current_task()->cow_faults
++;
1447 object
= first_object
;
1448 offset
= first_offset
;
1450 vm_object_lock(object
);
1451 VM_PAGE_FREE(first_m
);
1452 first_m
= VM_PAGE_NULL
;
1453 assert(copy_m
->busy
);
1454 vm_page_insert(copy_m
, object
, offset
);
1458 * Now that we've gotten the copy out of the
1459 * way, let's try to collapse the top object.
1460 * But we have to play ugly games with
1461 * paging_in_progress to do that...
1464 vm_object_paging_end(object
);
1465 vm_object_collapse(object
);
1466 vm_object_paging_begin(object
);
1470 *protection
&= (~VM_PROT_WRITE
);
1475 * Now check whether the page needs to be pushed into the
1476 * copy object. The use of asymmetric copy on write for
1477 * shared temporary objects means that we may do two copies to
1478 * satisfy the fault; one above to get the page from a
1479 * shadowed object, and one here to push it into the copy.
1482 while ((copy_object
= first_object
->copy
) != VM_OBJECT_NULL
&&
1483 (m
!= VM_PAGE_NULL
)) {
1484 vm_object_offset_t copy_offset
;
1488 dbgTrace(0xBEEF0017, (unsigned int) copy_object
, (unsigned int) fault_type
); /* (TEST/DEBUG) */
1491 * If the page is being written, but hasn't been
1492 * copied to the copy-object, we have to copy it there.
1495 if ((fault_type
& VM_PROT_WRITE
) == 0) {
1496 *protection
&= ~VM_PROT_WRITE
;
1501 * If the page was guaranteed to be resident,
1502 * we must have already performed the copy.
1505 if (must_be_resident
)
1509 * Try to get the lock on the copy_object.
1511 if (!vm_object_lock_try(copy_object
)) {
1512 vm_object_unlock(object
);
1514 mutex_pause(); /* wait a bit */
1516 vm_object_lock(object
);
1521 * Make another reference to the copy-object,
1522 * to keep it from disappearing during the
1525 assert(copy_object
->ref_count
> 0);
1526 copy_object
->ref_count
++;
1527 VM_OBJ_RES_INCR(copy_object
);
1530 * Does the page exist in the copy?
1532 copy_offset
= first_offset
- copy_object
->shadow_offset
;
1533 if (copy_object
->size
<= copy_offset
)
1535 * Copy object doesn't cover this page -- do nothing.
1539 vm_page_lookup(copy_object
, copy_offset
)) != VM_PAGE_NULL
) {
1540 /* Page currently exists in the copy object */
1543 * If the page is being brought
1544 * in, wait for it and then retry.
1547 /* take an extra ref so object won't die */
1548 assert(copy_object
->ref_count
> 0);
1549 copy_object
->ref_count
++;
1550 vm_object_res_reference(copy_object
);
1551 vm_object_unlock(copy_object
);
1552 vm_fault_cleanup(object
, first_m
);
1553 counter(c_vm_fault_page_block_backoff_kernel
++);
1554 vm_object_lock(copy_object
);
1555 assert(copy_object
->ref_count
> 0);
1556 VM_OBJ_RES_DECR(copy_object
);
1557 copy_object
->ref_count
--;
1558 assert(copy_object
->ref_count
> 0);
1559 copy_m
= vm_page_lookup(copy_object
, copy_offset
);
1560 if (copy_m
!= VM_PAGE_NULL
&& copy_m
->busy
) {
1561 PAGE_ASSERT_WAIT(copy_m
, interruptible
);
1562 vm_object_unlock(copy_object
);
1563 wait_result
= thread_block(THREAD_CONTINUE_NULL
);
1564 vm_object_deallocate(copy_object
);
1567 vm_object_unlock(copy_object
);
1568 vm_object_deallocate(copy_object
);
1569 thread_interrupt_level(interruptible_state
);
1570 return VM_FAULT_RETRY
;
1574 else if (!PAGED_OUT(copy_object
, copy_offset
)) {
1576 * If PAGED_OUT is TRUE, then the page used to exist
1577 * in the copy-object, and has already been paged out.
1578 * We don't need to repeat this. If PAGED_OUT is
1579 * FALSE, then either we don't know (!pager_created,
1580 * for example) or it hasn't been paged out.
1581 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1582 * We must copy the page to the copy object.
1586 * Allocate a page for the copy
1588 copy_m
= vm_page_alloc(copy_object
, copy_offset
);
1589 if (copy_m
== VM_PAGE_NULL
) {
1591 VM_OBJ_RES_DECR(copy_object
);
1592 copy_object
->ref_count
--;
1593 assert(copy_object
->ref_count
> 0);
1594 vm_object_unlock(copy_object
);
1595 vm_fault_cleanup(object
, first_m
);
1596 thread_interrupt_level(interruptible_state
);
1597 return(VM_FAULT_MEMORY_SHORTAGE
);
1601 * Must copy page into copy-object.
1604 vm_page_copy(m
, copy_m
);
1607 * If the old page was in use by any users
1608 * of the copy-object, it must be removed
1609 * from all pmaps. (We can't know which
1613 vm_page_lock_queues();
1614 assert(!m
->cleaning
);
1615 pmap_page_protect(m
->phys_addr
, VM_PROT_NONE
);
1616 copy_m
->dirty
= TRUE
;
1617 vm_page_unlock_queues();
1620 * If there's a pager, then immediately
1621 * page out this page, using the "initialize"
1622 * option. Else, we use the copy.
1627 ((!copy_object
->pager_created
) ||
1628 vm_external_state_get(
1629 copy_object
->existence_map
, copy_offset
)
1630 == VM_EXTERNAL_STATE_ABSENT
)
1632 (!copy_object
->pager_created
)
1635 vm_page_lock_queues();
1636 vm_page_activate(copy_m
);
1637 vm_page_unlock_queues();
1638 PAGE_WAKEUP_DONE(copy_m
);
1641 assert(copy_m
->busy
== TRUE
);
1644 * The page is already ready for pageout:
1645 * not on pageout queues and busy.
1646 * Unlock everything except the
1647 * copy_object itself.
1650 vm_object_unlock(object
);
1653 * Write the page to the copy-object,
1654 * flushing it from the kernel.
1657 vm_pageout_initialize_page(copy_m
);
1660 * Since the pageout may have
1661 * temporarily dropped the
1662 * copy_object's lock, we
1663 * check whether we'll have
1664 * to deallocate the hard way.
1667 if ((copy_object
->shadow
!= object
) ||
1668 (copy_object
->ref_count
== 1)) {
1669 vm_object_unlock(copy_object
);
1670 vm_object_deallocate(copy_object
);
1671 vm_object_lock(object
);
1676 * Pick back up the old object's
1677 * lock. [It is safe to do so,
1678 * since it must be deeper in the
1682 vm_object_lock(object
);
1686 * Because we're pushing a page upward
1687 * in the object tree, we must restart
1688 * any faults that are waiting here.
1689 * [Note that this is an expansion of
1690 * PAGE_WAKEUP that uses the THREAD_RESTART
1691 * wait result]. Can't turn off the page's
1692 * busy bit because we're not done with it.
1697 thread_wakeup_with_result((event_t
) m
,
1703 * The reference count on copy_object must be
1704 * at least 2: one for our extra reference,
1705 * and at least one from the outside world
1706 * (we checked that when we last locked
1709 copy_object
->ref_count
--;
1710 assert(copy_object
->ref_count
> 0);
1711 VM_OBJ_RES_DECR(copy_object
);
1712 vm_object_unlock(copy_object
);
1718 *top_page
= first_m
;
1721 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1722 (integer_t
)object
, offset
, (integer_t
)m
, (integer_t
)first_m
, 0);
1724 * If the page can be written, assume that it will be.
1725 * [Earlier, we restrict the permission to allow write
1726 * access only if the fault so required, so we don't
1727 * mark read-only data as dirty.]
1730 #if !VM_FAULT_STATIC_CONFIG
1731 if (vm_fault_dirty_handling
&& (*protection
& VM_PROT_WRITE
) &&
1732 (m
!= VM_PAGE_NULL
)) {
1737 dbgTrace(0xBEEF0018, (unsigned int) object
, (unsigned int) vm_page_deactivate_behind
); /* (TEST/DEBUG) */
1739 if (vm_page_deactivate_behind
) {
1740 if (offset
&& /* don't underflow */
1741 (object
->last_alloc
== (offset
- PAGE_SIZE_64
))) {
1742 m
= vm_page_lookup(object
, object
->last_alloc
);
1743 if ((m
!= VM_PAGE_NULL
) && !m
->busy
) {
1744 vm_page_lock_queues();
1745 vm_page_deactivate(m
);
1746 vm_page_unlock_queues();
1749 dbgTrace(0xBEEF0019, (unsigned int) object
, (unsigned int) m
); /* (TEST/DEBUG) */
1752 object
->last_alloc
= offset
;
1755 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS
, 0); /* (TEST/DEBUG) */
1757 thread_interrupt_level(interruptible_state
);
1758 if(*result_page
== VM_PAGE_NULL
) {
1759 vm_object_unlock(object
);
1761 return(VM_FAULT_SUCCESS
);
1765 vm_fault_cleanup(object
, first_m
);
1767 counter(c_vm_fault_page_block_backoff_kernel
++);
1768 thread_block(THREAD_CONTINUE_NULL
);
1772 thread_interrupt_level(interruptible_state
);
1773 if (wait_result
== THREAD_INTERRUPTED
)
1774 return VM_FAULT_INTERRUPTED
;
1775 return VM_FAULT_RETRY
;
/*
 *	Routine:	vm_fault
 *		Handle page faults, including pseudo-faults
 *		used to change the wiring status of pages.
 *
 *		Explicit continuations have been removed.
 *
 *		vm_fault and vm_fault_page save mucho state
 *		in the moral equivalent of a closure.  The state
 *		structure is allocated when first entering vm_fault
 *		and deallocated when leaving vm_fault.
 */
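
/*
 * Illustrative sketch (not part of the original source): architecture
 * trap handlers resolve a user fault with a call along the lines of
 *
 *	kr = vm_fault(map, trunc_page(fault_addr), fault_type,
 *		      FALSE, THREAD_ABORTSAFE, NULL, 0);
 *
 * where FALSE means "not a wiring change" and the trailing NULL/0 leave
 * caller_pmap/caller_pmap_addr unused.  The argument names here are a
 * sketch against the parameters declared just below, and trunc_page()
 * stands in for whatever page rounding the caller performs.
 */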
	vm_prot_t	fault_type,
	boolean_t	change_wiring,
	vm_offset_t	caller_pmap_addr)
	vm_map_version_t	version;	/* Map version for verification */
	boolean_t		wired;		/* Should mapping be wired down? */
	vm_object_t		object;		/* Top-level object */
	vm_object_offset_t	offset;		/* Top-level offset */
	vm_prot_t		prot;		/* Protection for mapping */
	vm_behavior_t		behavior;	/* Expected paging behavior */
	vm_object_offset_t	lo_offset, hi_offset;
	vm_object_t		old_copy_object; /* Saved copy object */
	vm_page_t		result_page;	/* Result of vm_fault_page */
	vm_page_t		top_page;	/* Placeholder page */
	vm_page_t		m;		/* Fast access to result_page */
	kern_return_t		error_code;	/* page error reasons */
	vm_object_t		cur_object;
	vm_object_offset_t	cur_offset;
	vm_object_t		new_object;
	vm_map_t		pmap_map = map;
	vm_map_t		original_map = map;
	boolean_t		funnel_set = FALSE;
	thread_t		cur_thread;
	boolean_t		interruptible_state;
	unsigned int		cache_attr;
	int			write_startup_file = 0;
	vm_prot_t		full_fault_type;
1839 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM
, 0)) | DBG_FUNC_START
,
1846 cur_thread
= current_thread();
	/* at present we do not fully check for execute permission */
	/* we generally treat it as read except in certain device */
	/* memory settings */
1850 full_fault_type
= fault_type
;
1851 if(fault_type
& VM_PROT_EXECUTE
) {
1852 fault_type
&= ~VM_PROT_EXECUTE
;
1853 fault_type
|= VM_PROT_READ
;
1856 interruptible_state
= thread_interrupt_level(interruptible
);
1859 * assume we will hit a page in the cache
1860 * otherwise, explicitly override with
1861 * the real fault type once we determine it
1863 type_of_fault
= DBG_CACHE_HIT_FAULT
;
1866 current_task()->faults
++;
1869 * drop funnel if it is already held. Then restore while returning
1871 if ((cur_thread
->funnel_state
& TH_FN_OWNED
) == TH_FN_OWNED
) {
1873 curflock
= cur_thread
->funnel_lock
;
1874 thread_funnel_set( curflock
, FALSE
);
1880 * Find the backing store object and offset into
1881 * it to begin the search.
1884 vm_map_lock_read(map
);
1885 kr
= vm_map_lookup_locked(&map
, vaddr
, fault_type
, &version
,
1888 &behavior
, &lo_offset
, &hi_offset
, &pmap_map
);
1890 pmap
= pmap_map
->pmap
;
1892 if (kr
!= KERN_SUCCESS
) {
1893 vm_map_unlock_read(map
);
1898 * If the page is wired, we must fault for the current protection
1899 * value, to avoid further faults.
1903 fault_type
= prot
| VM_PROT_WRITE
;
1905 #if VM_FAULT_CLASSIFY
1907 * Temporary data gathering code
1909 vm_fault_classify(object
, offset
, fault_type
);
1912 * Fast fault code. The basic idea is to do as much as
1913 * possible while holding the map lock and object locks.
1914 * Busy pages are not used until the object lock has to
1915 * be dropped to do something (copy, zero fill, pmap enter).
1916 * Similarly, paging references aren't acquired until that
1917 * point, and object references aren't used.
1919 * If we can figure out what to do
1920 * (zero fill, copy on write, pmap enter) while holding
1921 * the locks, then it gets done. Otherwise, we give up,
1922 * and use the original fault path (which doesn't hold
1923 * the map lock, and relies on busy pages).
1924 * The give up cases include:
1925 * - Have to talk to pager.
1926 * - Page is busy, absent or in error.
1927 * - Pager has locked out desired access.
1928 * - Fault needs to be restarted.
1929 * - Have to push page into copy object.
1931 * The code is an infinite loop that moves one level down
1932 * the shadow chain each time. cur_object and cur_offset
1933 * refer to the current object being examined. object and offset
1934 * are the original object from the map. The loop is at the
1935 * top level if and only if object and cur_object are the same.
1937 * Invariants: Map lock is held throughout. Lock is held on
1938 * original object and cur_object (if different) when
1939 * continuing or exiting loop.
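
	/*
	 * Illustrative note (not part of the original source): in the fast
	 * path that follows, cur_object/cur_offset walk down the shadow
	 * chain while object/offset stay pinned to the map's top-level
	 * object; the loop falls back to the slow path (vm_fault_page) as
	 * soon as it would need to talk to a pager, wait for a busy page,
	 * or push a page into a copy object.
	 */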
1945 * If this page is to be inserted in a copy delay object
1946 * for writing, and if the object has a copy, then the
1947 * copy delay strategy is implemented in the slow fault page.
1949 if (object
->copy_strategy
!= MEMORY_OBJECT_COPY_DELAY
||
1950 object
->copy
== VM_OBJECT_NULL
||
1951 (fault_type
& VM_PROT_WRITE
) == 0) {
1952 cur_object
= object
;
1953 cur_offset
= offset
;
1956 m
= vm_page_lookup(cur_object
, cur_offset
);
1957 if (m
!= VM_PAGE_NULL
) {
1959 wait_result_t result
;
1961 if (object
!= cur_object
)
1962 vm_object_unlock(object
);
1964 vm_map_unlock_read(map
);
1965 if (pmap_map
!= map
)
1966 vm_map_unlock(pmap_map
);
1968 #if !VM_FAULT_STATIC_CONFIG
1969 if (!vm_fault_interruptible
)
1970 interruptible
= THREAD_UNINT
;
1972 result
= PAGE_ASSERT_WAIT(m
, interruptible
);
1974 vm_object_unlock(cur_object
);
1976 if (result
== THREAD_WAITING
) {
1977 result
= thread_block(THREAD_CONTINUE_NULL
);
1979 counter(c_vm_fault_page_block_busy_kernel
++);
1981 if (result
== THREAD_AWAKENED
|| result
== THREAD_RESTART
)
1987 if (m
->unusual
&& (m
->error
|| m
->restart
|| m
->private
1988 || m
->absent
|| (fault_type
& m
->page_lock
))) {
1991 * Unusual case. Give up.
1997 * Two cases of map in faults:
1998 * - At top level w/o copy object.
1999 * - Read fault anywhere.
2000 * --> must disallow write.
2003 if (object
== cur_object
&&
2004 object
->copy
== VM_OBJECT_NULL
)
2005 goto FastMapInFault
;
2007 if ((fault_type
& VM_PROT_WRITE
) == 0) {
2009 prot
&= ~VM_PROT_WRITE
;
2012 * Set up to map the page ...
2013 * mark the page busy, drop
2014 * locks and take a paging reference
2015 * on the object with the page.
2018 if (object
!= cur_object
) {
2019 vm_object_unlock(object
);
2020 object
= cur_object
;
2025 vm_object_paging_begin(object
);
2029 * Check a couple of global reasons to
2030 * be conservative about write access.
2031 * Then do the pmap_enter.
2033 #if !VM_FAULT_STATIC_CONFIG
2034 if (vm_fault_dirty_handling
2036 || db_watchpoint_list
2038 && (fault_type
& VM_PROT_WRITE
) == 0)
2039 prot
&= ~VM_PROT_WRITE
;
2040 #else /* STATIC_CONFIG */
2042 if (db_watchpoint_list
2043 && (fault_type
& VM_PROT_WRITE
) == 0)
2044 prot
&= ~VM_PROT_WRITE
;
2045 #endif /* MACH_KDB */
2046 #endif /* STATIC_CONFIG */
2047 if (m
->no_isync
== TRUE
) {
2048 pmap_sync_caches_phys(m
->phys_addr
);
2049 m
->no_isync
= FALSE
;
2052 cache_attr
= ((unsigned int)m
->object
->wimg_bits
) & VM_WIMG_MASK
;
2054 PMAP_ENTER(caller_pmap
,
2055 caller_pmap_addr
, m
,
2056 prot
, cache_attr
, wired
);
2058 PMAP_ENTER(pmap
, vaddr
, m
,
2059 prot
, cache_attr
, wired
);
2063 * Grab the queues lock to manipulate
2064 * the page queues. Change wiring
2065 * case is obvious. In soft ref bits
2066 * case activate page only if it fell
2067 * off paging queues, otherwise just
2068 * activate it if it's inactive.
2070 * NOTE: original vm_fault code will
2071 * move active page to back of active
2072 * queue. This code doesn't.
2074 vm_page_lock_queues();
2077 vm_pagein_cluster_used
++;
2078 m
->clustered
= FALSE
;
2080 m
->reference
= TRUE
;
2082 if (change_wiring
) {
2088 #if VM_FAULT_STATIC_CONFIG
2090 if (!m
->active
&& !m
->inactive
)
2091 vm_page_activate(m
);
2094 else if (software_reference_bits
) {
2095 if (!m
->active
&& !m
->inactive
)
2096 vm_page_activate(m
);
2098 else if (!m
->active
) {
2099 vm_page_activate(m
);
2102 vm_page_unlock_queues();
2105 * That's it, clean up and return.
2107 PAGE_WAKEUP_DONE(m
);
2108 vm_object_paging_end(object
);
2111 tws_hash_line_t line
;
2114 task
= current_task();
2116 (task
->dynamic_working_set
!= 0) &&
2117 !(object
->private)) {
2119 vm_object_t base_object
;
2120 vm_object_offset_t base_offset
;
2121 base_object
= object
;
2122 base_offset
= cur_offset
;
2123 while(base_object
->shadow
) {
2125 base_object
->shadow_offset
;
2127 base_object
->shadow
;
2129 kr
= tws_lookup((tws_hash_t
)
2130 task
->dynamic_working_set
,
2131 base_offset
, base_object
,
2133 if(kr
== KERN_OPERATION_TIMED_OUT
){
2134 write_startup_file
= 1;
2135 } else if (kr
!= KERN_SUCCESS
) {
2136 kr
= tws_insert((tws_hash_t
)
2137 task
->dynamic_working_set
,
2138 base_offset
, base_object
,
2140 if(kr
== KERN_NO_SPACE
) {
2141 vm_object_unlock(object
);
2143 tws_expand_working_set(
2144 task
->dynamic_working_set
,
2145 TWS_HASH_LINE_COUNT
,
2148 vm_object_lock(object
);
2151 KERN_OPERATION_TIMED_OUT
) {
2152 write_startup_file
= 1;
2157 vm_object_unlock(object
);
2159 vm_map_unlock_read(map
);
2161 vm_map_unlock(pmap_map
);
2163 if(write_startup_file
)
2164 tws_send_startup_info(current_task());
2167 thread_funnel_set( curflock
, TRUE
);
2169 thread_interrupt_level(interruptible_state
);
2172 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM
, 0)) | DBG_FUNC_END
,
2174 type_of_fault
& 0xff,
2179 return KERN_SUCCESS
;
2183 * Copy on write fault. If objects match, then
2184 * object->copy must not be NULL (else control
2185 * would be in previous code block), and we
2186 * have a potential push into the copy object
2187 * with which we won't cope here.
2190 if (cur_object
== object
)
2193 * This is now a shadow based copy on write
2194 * fault -- it requires a copy up the shadow
2197 * Allocate a page in the original top level
2198 * object. Give up if allocate fails. Also
2199 * need to remember current page, as it's the
2200 * source of the copy.
2204 if (m
== VM_PAGE_NULL
) {
2208 * Now do the copy. Mark the source busy
2209 * and take out paging references on both
2212 * NOTE: This code holds the map lock across
2217 vm_page_copy(cur_m
, m
);
2218 vm_page_insert(m
, object
, offset
);
2220 vm_object_paging_begin(cur_object
);
2221 vm_object_paging_begin(object
);
2223 type_of_fault
= DBG_COW_FAULT
;
2224 VM_STAT(cow_faults
++);
2225 current_task()->cow_faults
++;
2228 * Now cope with the source page and object
2229 * If the top object has a ref count of 1
2230 * then no other map can access it, and hence
2231 * it's not necessary to do the pmap_page_protect.
2235 vm_page_lock_queues();
2236 vm_page_deactivate(cur_m
);
2238 pmap_page_protect(cur_m
->phys_addr
,
2240 vm_page_unlock_queues();
2242 PAGE_WAKEUP_DONE(cur_m
);
2243 vm_object_paging_end(cur_object
);
2244 vm_object_unlock(cur_object
);
2247 * Slight hack to call vm_object collapse
2248 * and then reuse common map in code.
2249 * note that the object lock was taken above.
2252 vm_object_paging_end(object
);
2253 vm_object_collapse(object
);
2254 vm_object_paging_begin(object
);
			/*
			 *	No page at cur_object, cur_offset
			 */

			if (cur_object->pager_created) {

				/*
				 *	Have to talk to the pager.  Give up.
				 */
				break;
			}

			if (cur_object->shadow == VM_OBJECT_NULL) {

				if (cur_object->shadow_severed) {
					vm_object_paging_end(object);
					vm_object_unlock(object);
					vm_map_unlock_read(map);
					if (pmap_map != map)
						vm_map_unlock(pmap_map);

					if (write_startup_file)
						tws_send_startup_info(current_task());

					if (funnel_set)
						thread_funnel_set(curflock, TRUE);

					thread_interrupt_level(interruptible_state);

					return VM_FAULT_MEMORY_ERROR;
				}
				/*
				 *	Zero fill fault.  Page gets
				 *	filled in top object.  Insert
				 *	page, then drop any lower lock.
				 *	Give up if no page.
				 */
				if ((vm_page_free_target -
				    ((vm_page_free_target - vm_page_free_min) >> 2))
				    > vm_page_free_count) {
					break;
				}
				m = vm_page_alloc(object, offset);
				if (m == VM_PAGE_NULL) {
					break;
				}

				/*
				 *	This is a zero-fill or initial fill
				 *	page fault.  As such, we consider it
				 *	undefined with respect to instruction
				 *	execution.  i.e. it is the responsibility
				 *	of higher layers to call for an instruction
				 *	sync after changing the contents and before
				 *	sending a program into this area.  We
				 *	choose this approach for performance.
				 */

				m->no_isync = FALSE;

				if (cur_object != object)
					vm_object_unlock(cur_object);

				vm_object_paging_begin(object);
				vm_object_unlock(object);

				/*
				 *	Now zero fill page and map it.
				 *	The page is probably going to
				 *	be written soon, so don't bother
				 *	to clear the modified bit.
				 *
				 *	NOTE: This code holds the map
				 *	lock across the zero fill.
				 */

				if (!map->no_zero_fill) {
					vm_page_zero_fill(m);
					type_of_fault = DBG_ZERO_FILL_FAULT;
					VM_STAT(zero_fill_count++);
				}
				vm_page_lock_queues();
				VM_PAGE_QUEUES_REMOVE(m);

				m->page_ticket = vm_page_ticket;
				if (m->object->size > 0x80000) {
					m->zero_fill = TRUE;
					/* depends on the queues lock */
					queue_enter(&vm_page_queue_zf,
						m, vm_page_t, pageq);
				} else {
					queue_enter(
						&vm_page_queue_inactive,
						m, vm_page_t, pageq);
				}
				vm_page_ticket_roll++;
				if (vm_page_ticket_roll ==
				    VM_PAGE_TICKETS_IN_ROLL) {
					vm_page_ticket_roll = 0;
					if (vm_page_ticket ==
					    VM_PAGE_TICKET_ROLL_IDS)
						vm_page_ticket = 0;
					else
						vm_page_ticket++;
				}

				m->inactive = TRUE;
				vm_page_inactive_count++;
				vm_page_unlock_queues();
				vm_object_lock(object);

				goto FastPmapEnter;
			}
			else {

				/*
				 *	On to the next level
				 */

				cur_offset += cur_object->shadow_offset;
				new_object = cur_object->shadow;
				vm_object_lock(new_object);
				if (cur_object != object)
					vm_object_unlock(cur_object);
				cur_object = new_object;

				continue;
			}
		}
	}
	/*
	 *	Cleanup from fast fault failure.  Drop any object
	 *	lock other than original and drop map lock.
	 */

	if (object != cur_object)
		vm_object_unlock(cur_object);

	vm_map_unlock_read(map);

	if (pmap_map != map)
		vm_map_unlock(pmap_map);

	/*
	 *	Make a reference to this object to
	 *	prevent its disposal while we are messing with
	 *	it.  Once we have the reference, the map is free
	 *	to be diddled.  Since objects reference their
	 *	shadows (and copies), they will stay around as well.
	 */

	assert(object->ref_count > 0);
	object->ref_count++;
	vm_object_res_reference(object);
	vm_object_paging_begin(object);

	XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
	{
		tws_hash_line_t	line;
		task_t		task;

		task = current_task();
		if ((map != NULL) &&
		    (task->dynamic_working_set != 0)
		    && !(object->private)) {
			vm_object_t		base_object;
			vm_object_offset_t	base_offset;

			base_object = object;
			base_offset = offset;
			while (base_object->shadow) {
				base_offset +=
					base_object->shadow_offset;
				base_object =
					base_object->shadow;
			}
			kr = tws_lookup((tws_hash_t)
				task->dynamic_working_set,
				base_offset, base_object,
				&line);
			if (kr == KERN_OPERATION_TIMED_OUT) {
				write_startup_file = 1;
			} else if (kr != KERN_SUCCESS) {
				tws_insert((tws_hash_t)
					task->dynamic_working_set,
					base_offset, base_object,
					vaddr, pmap_map);
				kr = tws_insert((tws_hash_t)
					task->dynamic_working_set,
					base_offset, base_object,
					vaddr, pmap_map);
				if (kr == KERN_NO_SPACE) {
					vm_object_unlock(object);

					tws_expand_working_set(
						task->dynamic_working_set,
						TWS_HASH_LINE_COUNT,
						FALSE);

					vm_object_lock(object);
				}
				if (kr == KERN_OPERATION_TIMED_OUT) {
					write_startup_file = 1;
				}
			}
		}
	}
	kr = vm_fault_page(object, offset, fault_type,
			   (change_wiring && !wired),
			   interruptible,
			   lo_offset, hi_offset, behavior,
			   &prot, &result_page, &top_page,
			   &type_of_fault,
			   &error_code, map->no_zero_fill, FALSE, map, vaddr);
	/*
	 *	If we didn't succeed, lose the object reference immediately.
	 */

	if (kr != VM_FAULT_SUCCESS)
		vm_object_deallocate(object);

	/*
	 *	See why we failed, and take corrective action.
	 */

	switch (kr) {
	case VM_FAULT_SUCCESS:
		break;
	case VM_FAULT_MEMORY_SHORTAGE:
		if (vm_page_wait((change_wiring) ?
				 THREAD_UNINT :
				 THREAD_ABORTSAFE))
			goto RetryFault;
		/* fall thru */
	case VM_FAULT_INTERRUPTED:
		kr = KERN_ABORTED;
		goto done;
	case VM_FAULT_RETRY:
		goto RetryFault;
	case VM_FAULT_FICTITIOUS_SHORTAGE:
		vm_page_more_fictitious();
		goto RetryFault;
	case VM_FAULT_MEMORY_ERROR:
		if (error_code)
			kr = error_code;
		else
			kr = KERN_MEMORY_ERROR;
		goto done;
	}

	m = result_page;

	if (m != VM_PAGE_NULL) {
		assert((change_wiring && !wired) ?
		       (top_page == VM_PAGE_NULL) :
		       ((top_page == VM_PAGE_NULL) == (m->object == object)));
	}
	/*
	 *	How to clean up the result of vm_fault_page.  This
	 *	happens whether the mapping is entered or not.
	 */

#define UNLOCK_AND_DEALLOCATE				\
	MACRO_BEGIN					\
	vm_fault_cleanup(m->object, top_page);		\
	vm_object_deallocate(object);			\
	MACRO_END

	/*
	 *	What to do with the resulting page from vm_fault_page
	 *	if it doesn't get entered into the physical map:
	 */

#define RELEASE_PAGE(m)					\
	MACRO_BEGIN					\
	PAGE_WAKEUP_DONE(m);				\
	vm_page_lock_queues();				\
	if (!m->active && !m->inactive)			\
		vm_page_activate(m);			\
	vm_page_unlock_queues();			\
	MACRO_END

	/*
	 *	We must verify that the maps have not changed
	 *	since our last lookup.
	 */

	if (m != VM_PAGE_NULL) {
		old_copy_object = m->object->copy;
		vm_object_unlock(m->object);
	} else {
		old_copy_object = VM_OBJECT_NULL;
	}
	if ((map != original_map) || !vm_map_verify(map, &version)) {
		vm_object_t		retry_object;
		vm_object_offset_t	retry_offset;
		vm_prot_t		retry_prot;

		/*
		 *	To avoid trying to write_lock the map while another
		 *	thread has it read_locked (in vm_map_pageable), we
		 *	do not try for write permission.  If the page is
		 *	still writable, we will get write permission.  If it
		 *	is not, or has been marked needs_copy, we enter the
		 *	mapping without write permission, and will merely
		 *	take another fault.
		 */
		map = original_map;
		vm_map_lock_read(map);
		kr = vm_map_lookup_locked(&map, vaddr,
				fault_type & ~VM_PROT_WRITE, &version,
				&retry_object, &retry_offset, &retry_prot,
				&wired, &behavior, &lo_offset, &hi_offset,
				&pmap_map);
		pmap = pmap_map->pmap;

		if (kr != KERN_SUCCESS) {
			vm_map_unlock_read(map);
			if (m != VM_PAGE_NULL) {
				vm_object_lock(m->object);
				RELEASE_PAGE(m);
				UNLOCK_AND_DEALLOCATE;
			} else {
				vm_object_deallocate(object);
			}
			goto done;
		}

		vm_object_unlock(retry_object);
		if (m != VM_PAGE_NULL) {
			vm_object_lock(m->object);
		} else {
			vm_object_lock(object);
		}

		if ((retry_object != object) ||
		    (retry_offset != offset)) {
			vm_map_unlock_read(map);
			if (pmap_map != map)
				vm_map_unlock(pmap_map);
			if (m != VM_PAGE_NULL) {
				RELEASE_PAGE(m);
				UNLOCK_AND_DEALLOCATE;
			} else {
				vm_object_deallocate(object);
			}
			goto RetryFault;
		}
		/*
		 *	Check whether the protection has changed or the object
		 *	has been copied while we left the map unlocked.
		 */
		prot &= retry_prot;

		if (m != VM_PAGE_NULL) {
			vm_object_unlock(m->object);
		} else {
			vm_object_unlock(object);
		}
	}
	if (m != VM_PAGE_NULL) {
		vm_object_lock(m->object);
	} else {
		vm_object_lock(object);
	}

	/*
	 *	If the copy object changed while the top-level object
	 *	was unlocked, then we must take away write permission.
	 */

	if (m != VM_PAGE_NULL) {
		if (m->object->copy != old_copy_object)
			prot &= ~VM_PROT_WRITE;
	}

	/*
	 *	If we want to wire down this page, but no longer have
	 *	adequate permissions, we must start all over.
	 */

	if (wired && (fault_type != (prot|VM_PROT_WRITE))) {
		vm_map_verify_done(map, &version);
		if (pmap_map != map)
			vm_map_unlock(pmap_map);
		if (m != VM_PAGE_NULL) {
			RELEASE_PAGE(m);
			UNLOCK_AND_DEALLOCATE;
		} else {
			vm_object_deallocate(object);
		}
		goto RetryFault;
	}
	/*
	 *	Put this page into the physical map.
	 *	We had to do the unlock above because pmap_enter
	 *	may cause other faults.  The page may be on
	 *	the pageout queues.  If the pageout daemon comes
	 *	across the page, it will remove it from the queues.
	 */
	if (m != VM_PAGE_NULL) {
		if (m->no_isync == TRUE) {
			pmap_sync_caches_phys(m->phys_addr);

			m->no_isync = FALSE;
		}

		cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;

		if (caller_pmap) {
			PMAP_ENTER(caller_pmap,
					caller_pmap_addr, m,
					prot, cache_attr, wired);
		} else {
			PMAP_ENTER(pmap, vaddr, m,
					prot, cache_attr, wired);
		}
		{
			tws_hash_line_t	line;
			task_t		task;

			task = current_task();
			if ((map != NULL) &&
			    (task->dynamic_working_set != 0)
			    && (object->private)) {
				vm_object_t		base_object;
				vm_object_offset_t	base_offset;

				base_object = m->object;
				base_offset = m->offset;
				while (base_object->shadow) {
					base_offset +=
						base_object->shadow_offset;
					base_object =
						base_object->shadow;
				}
				kr = tws_lookup((tws_hash_t)
					task->dynamic_working_set,
					base_offset, base_object, &line);
				if (kr == KERN_OPERATION_TIMED_OUT) {
					write_startup_file = 1;
				} else if (kr != KERN_SUCCESS) {
					tws_insert((tws_hash_t)
						task->dynamic_working_set,
						base_offset, base_object,
						vaddr, pmap_map);
					kr = tws_insert((tws_hash_t)
						task->dynamic_working_set,
						base_offset, base_object,
						vaddr, pmap_map);
					if (kr == KERN_NO_SPACE) {
						vm_object_unlock(m->object);
						tws_expand_working_set(
							task->dynamic_working_set,
							TWS_HASH_LINE_COUNT,
							FALSE);
						vm_object_lock(m->object);
					}
					if (kr == KERN_OPERATION_TIMED_OUT) {
						write_startup_file = 1;
					}
				}
			}
		}
	} else {

#ifndef i386
		int			memattr;
		struct phys_entry	*pp;
		vm_map_entry_t		entry;
		vm_offset_t		laddr;
		vm_offset_t		ldelta, hdelta;

		/*
		 * do a pmap block mapping from the physical address
		 * in the object
		 */
		if (pp = pmap_find_physentry(
			(vm_offset_t)object->shadow_offset)) {
			memattr = ((pp->pte1 & 0x00000078) >> 3);
		} else {
			memattr = VM_WIMG_MASK & (int)object->wimg_bits;
		}

		/* While we do not worry about execution protection in */
		/* general, we may be able to read device memory and   */
		/* still not be able to execute it.  Here we check for */
		/* the guarded bit.  If its set and we are attempting  */
		/* to execute, we return with a protection failure.    */

		if ((memattr & VM_MEM_GUARDED) &&
		    (full_fault_type & VM_PROT_EXECUTE)) {
			vm_map_verify_done(map, &version);
			if (pmap_map != map)
				vm_map_unlock(pmap_map);
			vm_fault_cleanup(object, top_page);
			vm_object_deallocate(object);
			kr = KERN_PROTECTION_FAILURE;
			goto done;
		}

		if (pmap_map != map) {
			vm_map_unlock(pmap_map);
		}
		if (original_map != map) {
			vm_map_unlock_read(map);
			vm_map_lock_read(original_map);
			map = original_map;
		}
		pmap_map = map;

		laddr = vaddr;
		hdelta = 0xFFFFF000;
		ldelta = 0xFFFFF000;

		while (vm_map_lookup_entry(map, laddr, &entry)) {
			if (ldelta > (laddr - entry->vme_start))
				ldelta = laddr - entry->vme_start;
			if (hdelta > (entry->vme_end - laddr))
				hdelta = entry->vme_end - laddr;
			if (entry->is_sub_map) {

				laddr = (laddr - entry->vme_start)
						+ entry->offset;
				vm_map_lock_read(entry->object.sub_map);
				if (map != pmap_map)
					vm_map_unlock_read(map);
				if (entry->use_pmap) {
					vm_map_unlock_read(pmap_map);
					pmap_map = entry->object.sub_map;
				}
				map = entry->object.sub_map;

			} else {
				break;
			}
		}

		if (vm_map_lookup_entry(map, laddr, &entry) &&
		    (entry->object.vm_object != NULL) &&
		    (entry->object.vm_object == object)) {

			if (caller_pmap) {
				pmap_map_block(caller_pmap,
					caller_pmap_addr - ldelta,
					((vm_offset_t)
					(entry->object.vm_object->shadow_offset))
					+ entry->offset +
					(laddr - entry->vme_start) - ldelta,
					ldelta + hdelta, prot,
					memattr, 0); /* Set up a block mapped area */
			} else {
				pmap_map_block(pmap_map->pmap, vaddr - ldelta,
					((vm_offset_t)
					(entry->object.vm_object->shadow_offset))
					+ entry->offset +
					(laddr - entry->vme_start) - ldelta,
					ldelta + hdelta, prot,
					memattr, 0); /* Set up a block mapped area */
			}
		}
#else
#ifdef notyet
		if (caller_pmap) {
			pmap_enter(caller_pmap, caller_pmap_addr,
				object->shadow_offset, prot, 0, TRUE);
		} else {
			pmap_enter(pmap, vaddr,
				object->shadow_offset, prot, 0, TRUE);
		}
		/* Map it in */
#endif
#endif

	}
	/*
	 *	If the page is not wired down and isn't already
	 *	on a pageout queue, then put it where the
	 *	pageout daemon can find it.
	 */
	if (m != VM_PAGE_NULL) {
		vm_page_lock_queues();

		if (change_wiring) {
			if (wired)
				vm_page_wire(m);
			else
				vm_page_unwire(m);
		}
#if VM_FAULT_STATIC_CONFIG
		else {
			if (!m->active && !m->inactive)
				vm_page_activate(m);
			m->reference = TRUE;
		}
#else
		else if (software_reference_bits) {
			if (!m->active && !m->inactive)
				vm_page_activate(m);
			m->reference = TRUE;
		} else {
			vm_page_activate(m);
		}
#endif
		vm_page_unlock_queues();
	}

	/*
	 *	Unlock everything, and return
	 */

	vm_map_verify_done(map, &version);
	if (pmap_map != map)
		vm_map_unlock(pmap_map);
	if (m != VM_PAGE_NULL) {
		PAGE_WAKEUP_DONE(m);
		UNLOCK_AND_DEALLOCATE;
	} else {
		vm_fault_cleanup(object, top_page);
		vm_object_deallocate(object);
	}
	kr = KERN_SUCCESS;

#undef	UNLOCK_AND_DEALLOCATE
#undef	RELEASE_PAGE

    done:
	if (write_startup_file)
		tws_send_startup_info(current_task());

	if (funnel_set)
		thread_funnel_set(curflock, TRUE);

	thread_interrupt_level(interruptible_state);

	KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
			      vaddr,
			      type_of_fault & 0xff,
			      kr,
			      0,
			      0);

	return kr;
}
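
/*
 * Illustrative sketch only (never compiled): roughly how a
 * machine-dependent trap handler might hand a user data access fault to
 * vm_fault().  The handler name and the way the faulting address and
 * access type are obtained are assumptions for illustration; the real
 * callers are the per-architecture trap code.
 */
#if 0
static kern_return_t
example_handle_data_fault(
	vm_map_t	map,		/* faulting task's map, or kernel_map */
	vm_offset_t	fault_addr,	/* faulting virtual address */
	boolean_t	is_write)	/* TRUE for a store access */
{
	vm_prot_t	prot = VM_PROT_READ | (is_write ? VM_PROT_WRITE : VM_PROT_NONE);

	/*
	 * Not a wiring request, so change_wiring is FALSE and no caller
	 * pmap is supplied; the fault is entered in map->pmap at the
	 * page-truncated address.
	 */
	return vm_fault(map, trunc_page(fault_addr), prot,
			FALSE, THREAD_ABORTSAFE, (pmap_t) 0, 0);
}
#endif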
/*
 *	vm_fault_wire:
 *
 *	Wire down a range of virtual addresses in a map.
 */
kern_return_t
vm_fault_wire(
	vm_map_t	map,
	vm_map_entry_t	entry,
	pmap_t		pmap,
	vm_offset_t	pmap_addr)
{

	register vm_offset_t	va;
	register vm_offset_t	end_addr = entry->vme_end;
	register kern_return_t	rc;

	assert(entry->in_transition);

	if ((entry->object.vm_object != NULL) &&
	    !entry->is_sub_map &&
	    entry->object.vm_object->phys_contiguous) {
		return KERN_SUCCESS;
	}

	/*
	 *	Inform the physical mapping system that the
	 *	range of addresses may not fault, so that
	 *	page tables and such can be locked down as well.
	 */

	pmap_pageable(pmap, pmap_addr,
		pmap_addr + (end_addr - entry->vme_start), FALSE);

	/*
	 *	We simulate a fault to get the page and enter it
	 *	in the physical map.
	 */

	for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
		if ((rc = vm_fault_wire_fast(
			map, va, entry, pmap,
			pmap_addr + (va - entry->vme_start)
			)) != KERN_SUCCESS) {
			rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
				(pmap == kernel_pmap) ?
					THREAD_UNINT : THREAD_ABORTSAFE,
				pmap, pmap_addr + (va - entry->vme_start));
		}

		if (rc != KERN_SUCCESS) {
			struct vm_map_entry	tmp_entry = *entry;

			/* unwire wired pages */
			tmp_entry.vme_end = va;
			vm_fault_unwire(map,
				&tmp_entry, FALSE, pmap, pmap_addr);

			return rc;
		}
	}
	return KERN_SUCCESS;
}
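
/*
 * Illustrative sketch only (never compiled): one plausible way a
 * higher-level wiring path could drive vm_fault_wire() for a single map
 * entry.  The helper name and the simplified locking and error handling
 * are assumptions for illustration; the real callers live in the map
 * code and follow the in_transition protocol asserted above.
 */
#if 0
static kern_return_t
example_wire_one_entry(
	vm_map_t	map,
	vm_map_entry_t	entry)
{
	kern_return_t	rc;

	/*
	 * The real caller marks the entry in_transition and drops the
	 * map write lock before faulting the pages in; vm_fault_wire()
	 * asserts that this has been done.  For an entry mapped through
	 * the map's own pmap, pmap_addr is simply the entry's start.
	 */
	rc = vm_fault_wire(map, entry, map->pmap, entry->vme_start);

	/*
	 * On failure vm_fault_wire() has already unwired whatever it
	 * managed to wire, so the caller only reports the error.
	 */
	return rc;
}
#endif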
/*
 *	vm_fault_unwire:
 *
 *	Unwire a range of virtual addresses in a map.
 */
void
vm_fault_unwire(
	vm_map_t	map,
	vm_map_entry_t	entry,
	boolean_t	deallocate,
	pmap_t		pmap,
	vm_offset_t	pmap_addr)
{
	register vm_offset_t	va;
	register vm_offset_t	end_addr = entry->vme_end;
	vm_object_t		object;

	object = (entry->is_sub_map)
			? VM_OBJECT_NULL : entry->object.vm_object;

	/*
	 *	Since the pages are wired down, we must be able to
	 *	get their mappings from the physical map system.
	 */

	for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
		pmap_change_wiring(pmap,
			pmap_addr + (va - entry->vme_start), FALSE);

		if (object == VM_OBJECT_NULL) {
			(void) vm_fault(map, va, VM_PROT_NONE,
					TRUE, THREAD_UNINT, pmap, pmap_addr);
		} else if (object->phys_contiguous) {
			continue;
		} else {
			vm_prot_t	prot;
			vm_page_t	result_page;
			vm_page_t	top_page;
			vm_object_t	result_object;
			vm_fault_return_t result;

			do {
				prot = VM_PROT_NONE;

				vm_object_lock(object);
				vm_object_paging_begin(object);
				XPR(XPR_VM_FAULT,
					"vm_fault_unwire -> vm_fault_page\n",
					0,0,0,0,0);
				result = vm_fault_page(object,
						entry->offset +
						  (va - entry->vme_start),
						VM_PROT_NONE, TRUE,
						THREAD_UNINT,
						entry->offset,
						entry->offset +
						  (entry->vme_end
						   - entry->vme_start),
						entry->behavior,
						&prot,
						&result_page,
						&top_page,
						(int *)0,
						0, map->no_zero_fill,
						FALSE, NULL, 0);
			} while (result == VM_FAULT_RETRY);

			if (result != VM_FAULT_SUCCESS)
				panic("vm_fault_unwire: failure");

			result_object = result_page->object;
			if (deallocate) {
				assert(!result_page->fictitious);
				pmap_page_protect(result_page->phys_addr,
						VM_PROT_NONE);
				VM_PAGE_FREE(result_page);
			} else {
				vm_page_lock_queues();
				vm_page_unwire(result_page);
				vm_page_unlock_queues();
				PAGE_WAKEUP_DONE(result_page);
			}
			vm_fault_cleanup(result_object, top_page);
		}
	}

	/*
	 *	Inform the physical mapping system that the range
	 *	of addresses may fault, so that page tables and
	 *	such may be unwired themselves.
	 */

	pmap_pageable(pmap, pmap_addr,
		pmap_addr + (end_addr - entry->vme_start), TRUE);

}
/*
 *	vm_fault_wire_fast:
 *
 *	Handle common case of a wire down page fault at the given address.
 *	If successful, the page is inserted into the associated physical map.
 *	The map entry is passed in to avoid the overhead of a map lookup.
 *
 *	NOTE: the given address should be truncated to the
 *	proper page address.
 *
 *	KERN_SUCCESS is returned if the page fault is handled; otherwise,
 *	a standard error specifying why the fault is fatal is returned.
 *
 *	The map in question must be referenced, and remains so.
 *	Caller has a read lock on the map.
 *
 *	This is a stripped version of vm_fault() for wiring pages.  Anything
 *	other than the common case will return KERN_FAILURE, and the caller
 *	is expected to call vm_fault().
 */
kern_return_t
vm_fault_wire_fast(
	vm_map_t	map,
	vm_offset_t	va,
	vm_map_entry_t	entry,
	pmap_t		pmap,
	vm_offset_t	pmap_addr)
{
	vm_object_t		object;
	vm_object_offset_t	offset;
	register vm_page_t	m;
	vm_prot_t		prot;
	thread_act_t		thr_act;
	unsigned int		cache_attr;

	VM_STAT(faults++);

	if ((thr_act = current_act()) && (thr_act->task != TASK_NULL))
		thr_act->task->faults++;

	/*
	 *	Recovery actions
	 */

#undef	RELEASE_PAGE
#define RELEASE_PAGE(m)	{				\
	PAGE_WAKEUP_DONE(m);				\
	vm_page_lock_queues();				\
	vm_page_unwire(m);				\
	vm_page_unlock_queues();			\
}

#undef	UNLOCK_THINGS
#define UNLOCK_THINGS	{				\
	object->paging_in_progress--;			\
	vm_object_unlock(object);			\
}

#undef	UNLOCK_AND_DEALLOCATE
#define UNLOCK_AND_DEALLOCATE	{			\
	UNLOCK_THINGS;					\
	vm_object_deallocate(object);			\
}
	/*
	 *	Give up and have caller do things the hard way.
	 */

#define GIVE_UP {					\
	UNLOCK_AND_DEALLOCATE;				\
	return(KERN_FAILURE);				\
}


	/*
	 *	If this entry is not directly to a vm_object, bail out.
	 */
	if (entry->is_sub_map)
		return(KERN_FAILURE);

	/*
	 *	Find the backing store object and offset into it.
	 */

	object = entry->object.vm_object;
	offset = (va - entry->vme_start) + entry->offset;
	prot = entry->protection;

	/*
	 *	Make a reference to this object to prevent its
	 *	disposal while we are messing with it.
	 */

	vm_object_lock(object);
	assert(object->ref_count > 0);
	object->ref_count++;
	vm_object_res_reference(object);
	object->paging_in_progress++;

	/*
	 *	INVARIANTS (through entire routine):
	 *
	 *	1)	At all times, we must either have the object
	 *		lock or a busy page in some object to prevent
	 *		some other thread from trying to bring in
	 *		the same page.
	 *
	 *	2)	Once we have a busy page, we must remove it from
	 *		the pageout queues, so that the pageout daemon
	 *		will not grab it away.
	 *
	 */

	/*
	 *	Look for page in top-level object.  If it's not there or
	 *	there's something going on, give up.
	 */
	m = vm_page_lookup(object, offset);
	if ((m == VM_PAGE_NULL) || (m->busy) ||
	    (m->unusual && (m->error || m->restart || m->absent ||
			    prot & m->page_lock))) {

		GIVE_UP;
	}

	/*
	 *	Wire the page down now.  All bail outs beyond this
	 *	point must unwire the page.
	 */

	vm_page_lock_queues();
	vm_page_wire(m);
	vm_page_unlock_queues();

	/*
	 *	Mark page busy for other threads.
	 */
	assert(!m->busy);
	m->busy = TRUE;
	assert(!m->absent);

	/*
	 *	Give up if the page is being written and there's a copy object
	 */
	if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
		RELEASE_PAGE(m);
		GIVE_UP;
	}

	/*
	 *	Put this page into the physical map.
	 *	We have to unlock the object because pmap_enter
	 *	may cause other faults.
	 */
	if (m->no_isync == TRUE) {
		pmap_sync_caches_phys(m->phys_addr);

		m->no_isync = FALSE;
	}

	cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;

	PMAP_ENTER(pmap, pmap_addr, m, prot, cache_attr, TRUE);

	/*
	 *	Unlock everything, and return
	 */

	PAGE_WAKEUP_DONE(m);
	UNLOCK_AND_DEALLOCATE;

	return(KERN_SUCCESS);

}
/*
 *	Routine:	vm_fault_copy_cleanup
 *	Purpose:
 *		Release a page used by vm_fault_copy.
 */

void
vm_fault_copy_cleanup(
	vm_page_t	page,
	vm_page_t	top_page)
{
	vm_object_t	object = page->object;

	vm_object_lock(object);
	PAGE_WAKEUP_DONE(page);
	vm_page_lock_queues();
	if (!page->active && !page->inactive)
		vm_page_activate(page);
	vm_page_unlock_queues();
	vm_fault_cleanup(object, top_page);
}

void
vm_fault_copy_dst_cleanup(
	vm_page_t	page)
{
	vm_object_t	object;

	if (page != VM_PAGE_NULL) {
		object = page->object;
		vm_object_lock(object);
		vm_page_lock_queues();
		vm_page_unwire(page);
		vm_page_unlock_queues();
		vm_object_paging_end(object);
		vm_object_unlock(object);
	}
}
/*
 *	Routine:	vm_fault_copy
 *
 *	Purpose:
 *		Copy pages from one virtual memory object to another --
 *		neither the source nor destination pages need be resident.
 *
 *		Before actually copying a page, the version associated with
 *		the destination address map will be verified.
 *
 *	In/out conditions:
 *		The caller must hold a reference, but not a lock, to
 *		each of the source and destination objects and to the
 *		destination map.
 *
 *	Results:
 *		Returns KERN_SUCCESS if no errors were encountered in
 *		reading or writing the data.  Returns KERN_INTERRUPTED if
 *		the operation was interrupted (only possible if the
 *		"interruptible" argument is asserted).  Other return values
 *		indicate a permanent error in copying the data.
 *
 *		The actual amount of data copied will be returned in the
 *		"copy_size" argument.  In the event that the destination map
 *		verification failed, this amount may be less than the amount
 *		requested.
 */
kern_return_t
vm_fault_copy(
	vm_object_t		src_object,
	vm_object_offset_t	src_offset,
	vm_size_t		*src_size,		/* INOUT */
	vm_object_t		dst_object,
	vm_object_offset_t	dst_offset,
	vm_map_t		dst_map,
	vm_map_version_t	*dst_version,
	int			interruptible)
{
	vm_page_t		result_page;

	vm_page_t		src_page;
	vm_page_t		src_top_page;
	vm_prot_t		src_prot;

	vm_page_t		dst_page;
	vm_page_t		dst_top_page;
	vm_prot_t		dst_prot;

	vm_size_t		amount_left;
	vm_object_t		old_copy_object;
	kern_return_t		error = 0;

	vm_size_t		part_size;

	/*
	 * In order not to confuse the clustered pageins, align
	 * the different offsets on a page boundary.
	 */
	vm_object_offset_t	src_lo_offset = trunc_page_64(src_offset);
	vm_object_offset_t	dst_lo_offset = trunc_page_64(dst_offset);
	vm_object_offset_t	src_hi_offset = round_page_64(src_offset + *src_size);
	vm_object_offset_t	dst_hi_offset = round_page_64(dst_offset + *src_size);

#define	RETURN(x)					\
	MACRO_BEGIN					\
	*src_size -= amount_left;			\
	MACRO_RETURN(x);				\
	MACRO_END

	amount_left = *src_size;
	do { /* while (amount_left > 0) */
		/*
		 * There may be a deadlock if both source and destination
		 * pages are the same. To avoid this deadlock, the copy must
		 * start by getting the destination page in order to apply
		 * COW semantics if any.
		 */

	RetryDestinationFault: ;

		dst_prot = VM_PROT_WRITE|VM_PROT_READ;

		vm_object_lock(dst_object);
		vm_object_paging_begin(dst_object);

		XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
		switch (vm_fault_page(dst_object,
				      trunc_page_64(dst_offset),
				      VM_PROT_WRITE|VM_PROT_READ,
				      FALSE,
				      interruptible,
				      dst_lo_offset,
				      dst_hi_offset,
				      VM_BEHAVIOR_SEQUENTIAL,
				      &dst_prot,
				      &dst_page,
				      &dst_top_page,
				      (int *)0,
				      &error,
				      dst_map->no_zero_fill,
				      FALSE, NULL, 0)) {
		case VM_FAULT_SUCCESS:
			break;
		case VM_FAULT_RETRY:
			goto RetryDestinationFault;
		case VM_FAULT_MEMORY_SHORTAGE:
			if (vm_page_wait(interruptible))
				goto RetryDestinationFault;
			/* fall thru */
		case VM_FAULT_INTERRUPTED:
			RETURN(MACH_SEND_INTERRUPTED);
		case VM_FAULT_FICTITIOUS_SHORTAGE:
			vm_page_more_fictitious();
			goto RetryDestinationFault;
		case VM_FAULT_MEMORY_ERROR:
			if (error)
				return (error);
			else
				return(KERN_MEMORY_ERROR);
		}
		assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);

		old_copy_object = dst_page->object->copy;

		/*
		 * There exists the possibility that the source and
		 * destination page are the same.  But we can't
		 * easily determine that now.  If they are the
		 * same, the call to vm_fault_page() for the
		 * destination page will deadlock.  To prevent this we
		 * wire the page so we can drop busy without having
		 * the page daemon steal the page.  We clean up the
		 * top page but keep the paging reference on the object
		 * holding the dest page so it doesn't go away.
		 */

		vm_page_lock_queues();
		vm_page_wire(dst_page);
		vm_page_unlock_queues();
		PAGE_WAKEUP_DONE(dst_page);
		vm_object_unlock(dst_page->object);

		if (dst_top_page != VM_PAGE_NULL) {
			vm_object_lock(dst_object);
			VM_PAGE_FREE(dst_top_page);
			vm_object_paging_end(dst_object);
			vm_object_unlock(dst_object);
		}

	RetrySourceFault: ;

		if (src_object == VM_OBJECT_NULL) {
			/*
			 *	No source object.  We will just
			 *	zero-fill the page in dst_object.
			 */
			src_page = VM_PAGE_NULL;
			result_page = VM_PAGE_NULL;
		} else {
			vm_object_lock(src_object);
			src_page = vm_page_lookup(src_object,
						  trunc_page_64(src_offset));
			if (src_page == dst_page) {
				src_prot = dst_prot;
				result_page = VM_PAGE_NULL;
			} else {
				src_prot = VM_PROT_READ;
				vm_object_paging_begin(src_object);

				XPR(XPR_VM_FAULT,
					"vm_fault_copy(2) -> vm_fault_page\n",
					0,0,0,0,0);
				switch (vm_fault_page(src_object,
						      trunc_page_64(src_offset),
						      VM_PROT_READ,
						      FALSE,
						      interruptible,
						      src_lo_offset,
						      src_hi_offset,
						      VM_BEHAVIOR_SEQUENTIAL,
						      &src_prot,
						      &result_page,
						      &src_top_page,
						      (int *)0,
						      &error,
						      FALSE,
						      FALSE, NULL, 0)) {

				case VM_FAULT_SUCCESS:
					break;
				case VM_FAULT_RETRY:
					goto RetrySourceFault;
				case VM_FAULT_MEMORY_SHORTAGE:
					if (vm_page_wait(interruptible))
						goto RetrySourceFault;
					/* fall thru */
				case VM_FAULT_INTERRUPTED:
					vm_fault_copy_dst_cleanup(dst_page);
					RETURN(MACH_SEND_INTERRUPTED);
				case VM_FAULT_FICTITIOUS_SHORTAGE:
					vm_page_more_fictitious();
					goto RetrySourceFault;
				case VM_FAULT_MEMORY_ERROR:
					vm_fault_copy_dst_cleanup(dst_page);
					if (error)
						return (error);
					else
						return(KERN_MEMORY_ERROR);
				}

				assert((src_top_page == VM_PAGE_NULL) ==
				       (result_page->object == src_object));
			}
			assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
			vm_object_unlock(result_page->object);
		}

		if (!vm_map_verify(dst_map, dst_version)) {
			if (result_page != VM_PAGE_NULL && src_page != dst_page)
				vm_fault_copy_cleanup(result_page, src_top_page);
			vm_fault_copy_dst_cleanup(dst_page);
			break;
		}

		vm_object_lock(dst_page->object);

		if (dst_page->object->copy != old_copy_object) {
			vm_object_unlock(dst_page->object);
			vm_map_verify_done(dst_map, dst_version);
			if (result_page != VM_PAGE_NULL && src_page != dst_page)
				vm_fault_copy_cleanup(result_page, src_top_page);
			vm_fault_copy_dst_cleanup(dst_page);
			break;
		}
		vm_object_unlock(dst_page->object);

		/*
		 *	Copy the page, and note that it is dirty
		 *	immediately.
		 */

		if (!page_aligned(src_offset) ||
		    !page_aligned(dst_offset) ||
		    !page_aligned(amount_left)) {

			vm_object_offset_t	src_po,
						dst_po;

			src_po = src_offset - trunc_page_64(src_offset);
			dst_po = dst_offset - trunc_page_64(dst_offset);

			if (dst_po > src_po) {
				part_size = PAGE_SIZE - dst_po;
			} else {
				part_size = PAGE_SIZE - src_po;
			}
			if (part_size > (amount_left)) {
				part_size = amount_left;
			}

			if (result_page == VM_PAGE_NULL) {
				vm_page_part_zero_fill(dst_page,
							dst_po, part_size);
			} else {
				vm_page_part_copy(result_page, src_po,
					dst_page, dst_po, part_size);
				if (!dst_page->dirty) {
					vm_object_lock(dst_object);
					dst_page->dirty = TRUE;
					vm_object_unlock(dst_page->object);
				}
			}
		} else {
			part_size = PAGE_SIZE;

			if (result_page == VM_PAGE_NULL)
				vm_page_zero_fill(dst_page);
			else {
				vm_page_copy(result_page, dst_page);
				if (!dst_page->dirty) {
					vm_object_lock(dst_object);
					dst_page->dirty = TRUE;
					vm_object_unlock(dst_page->object);
				}
			}
		}

		/*
		 *	Unlock everything, and return
		 */

		vm_map_verify_done(dst_map, dst_version);

		if (result_page != VM_PAGE_NULL && src_page != dst_page)
			vm_fault_copy_cleanup(result_page, src_top_page);
		vm_fault_copy_dst_cleanup(dst_page);

		amount_left -= part_size;
		src_offset += part_size;
		dst_offset += part_size;
	} while (amount_left > 0);

	RETURN(KERN_SUCCESS);
#undef	RETURN

	/*NOTREACHED*/
}
/*
 *	Routine:	vm_fault_page_overwrite
 *
 *	Description:
 *		A form of vm_fault_page that assumes that the
 *		resulting page will be overwritten in its entirety,
 *		making it unnecessary to obtain the correct *contents*
 *		of the page.
 *
 *	Implementation:
 *		XXX Untested.  Also unused.  Eventually, this technology
 *		could be used in vm_fault_copy() to advantage.
 */
vm_fault_return_t
vm_fault_page_overwrite(
	register
	vm_object_t		dst_object,
	vm_object_offset_t	dst_offset,
	vm_page_t		*result_page)	/* OUT */
{
	register
	vm_page_t	dst_page;
	kern_return_t	wait_result;

#define	interruptible	THREAD_UNINT	/* XXX */

	while (TRUE) {
		/*
		 *	Look for a page at this offset
		 */

		while ((dst_page = vm_page_lookup(dst_object, dst_offset))
				== VM_PAGE_NULL) {
			/*
			 *	No page, no problem... just allocate one.
			 */

			dst_page = vm_page_alloc(dst_object, dst_offset);
			if (dst_page == VM_PAGE_NULL) {
				vm_object_unlock(dst_object);
				VM_PAGE_WAIT();
				vm_object_lock(dst_object);
				continue;
			}

			/*
			 *	Pretend that the memory manager
			 *	write-protected the page.
			 *
			 *	Note that we will be asking for write
			 *	permission without asking for the data
			 *	first.
			 */

			dst_page->overwriting = TRUE;
			dst_page->page_lock = VM_PROT_WRITE;
			dst_page->absent = TRUE;
			dst_page->unusual = TRUE;
			dst_object->absent_count++;

			break;

			/*
			 *	When we bail out, we might have to throw
			 *	away the page created here.
			 */

#define	DISCARD_PAGE						\
	MACRO_BEGIN						\
	vm_object_lock(dst_object);				\
	dst_page = vm_page_lookup(dst_object, dst_offset);	\
	if ((dst_page != VM_PAGE_NULL) && dst_page->overwriting) \
		VM_PAGE_FREE(dst_page);				\
	vm_object_unlock(dst_object);				\
	MACRO_END
		}

		/*
		 *	If the page is write-protected...
		 */

		if (dst_page->page_lock & VM_PROT_WRITE) {
			/*
			 *	... and an unlock request hasn't been sent
			 */

			if ( ! (dst_page->unlock_request & VM_PROT_WRITE)) {
				vm_prot_t	u;
				kern_return_t	rc;

				/*
				 *	... then send one now.
				 */

				if (!dst_object->pager_ready) {
					wait_result = vm_object_assert_wait(dst_object,
							VM_OBJECT_EVENT_PAGER_READY,
							interruptible);
					vm_object_unlock(dst_object);
					if (wait_result == THREAD_WAITING)
						wait_result = thread_block(THREAD_CONTINUE_NULL);
					if (wait_result != THREAD_AWAKENED) {
						DISCARD_PAGE;
						return(VM_FAULT_INTERRUPTED);
					}
					vm_object_lock(dst_object);
					continue;
				}

				u = dst_page->unlock_request |= VM_PROT_WRITE;
				vm_object_unlock(dst_object);

				if ((rc = memory_object_data_unlock(
						dst_object->pager,
						dst_offset + dst_object->paging_offset,
						PAGE_SIZE,
						u)) != KERN_SUCCESS) {
					if (vm_fault_debug)
						printf("vm_object_overwrite: memory_object_data_unlock failed\n");
					DISCARD_PAGE;
					return((rc == MACH_SEND_INTERRUPTED) ?
						VM_FAULT_INTERRUPTED :
						VM_FAULT_MEMORY_ERROR);
				}
				vm_object_lock(dst_object);
				continue;
			}

			/* ... fall through to wait below */
		} else {
			/*
			 *	If the page isn't being used for other
			 *	purposes, then we're done.
			 */
			if ( ! (dst_page->busy || dst_page->absent ||
				dst_page->error || dst_page->restart) )
				break;
		}

		wait_result = PAGE_ASSERT_WAIT(dst_page, interruptible);
		vm_object_unlock(dst_object);
		if (wait_result == THREAD_WAITING)
			wait_result = thread_block(THREAD_CONTINUE_NULL);
		if (wait_result != THREAD_AWAKENED) {
			DISCARD_PAGE;
			return(VM_FAULT_INTERRUPTED);
		}

		vm_object_lock(dst_object);
	}

	*result_page = dst_page;
	return(VM_FAULT_SUCCESS);

#undef	interruptible
#undef	DISCARD_PAGE
}
#if	VM_FAULT_CLASSIFY
/*
 *	Temporary statistics gathering support.
 */

/*
 *	Statistics arrays:
 */
#define VM_FAULT_TYPES_MAX	5
#define	VM_FAULT_LEVEL_MAX	8

int	vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];

#define	VM_FAULT_TYPE_ZERO_FILL	0
#define	VM_FAULT_TYPE_MAP_IN	1
#define	VM_FAULT_TYPE_PAGER	2
#define	VM_FAULT_TYPE_COPY	3
#define	VM_FAULT_TYPE_OTHER	4


void
vm_fault_classify(vm_object_t		object,
		  vm_object_offset_t	offset,
		  vm_prot_t		fault_type)
{
	int		type, level = 0;
	vm_page_t	m;

	while (TRUE) {
		m = vm_page_lookup(object, offset);
		if (m != VM_PAGE_NULL) {
			if (m->busy || m->error || m->restart || m->absent ||
			    fault_type & m->page_lock) {
				type = VM_FAULT_TYPE_OTHER;
				break;
			}
			if (((fault_type & VM_PROT_WRITE) == 0) ||
			    ((level == 0) && object->copy == VM_OBJECT_NULL)) {
				type = VM_FAULT_TYPE_MAP_IN;
				break;
			}
			type = VM_FAULT_TYPE_COPY;
			break;
		}
		else {
			if (object->pager_created) {
				type = VM_FAULT_TYPE_PAGER;
				break;
			}
			if (object->shadow == VM_OBJECT_NULL) {
				type = VM_FAULT_TYPE_ZERO_FILL;
				break;
			}

			offset += object->shadow_offset;
			object = object->shadow;
			level++;
			continue;
		}
	}

	if (level > VM_FAULT_LEVEL_MAX)
		level = VM_FAULT_LEVEL_MAX;

	vm_fault_stats[type][level] += 1;

	return;
}

/* cleanup routine to call from debugger */

void
vm_fault_classify_init(void)
{
	int	type, level;

	for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
		for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
			vm_fault_stats[type][level] = 0;
		}
	}

	return;
}
#endif	/* VM_FAULT_CLASSIFY */