1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25 /*
26 * @OSF_COPYRIGHT@
27 */
28 /*
29 * Mach Operating System
30 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
31 * All Rights Reserved.
32 *
33 * Permission to use, copy, modify and distribute this software and its
34 * documentation is hereby granted, provided that both the copyright
35 * notice and this permission notice appear in all copies of the
36 * software, derivative works or modified versions, and any portions
37 * thereof, and that both notices appear in supporting documentation.
38 *
39 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
40 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
41 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
42 *
43 * Carnegie Mellon requests users of this software to return to
44 *
45 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
46 * School of Computer Science
47 * Carnegie Mellon University
48 * Pittsburgh PA 15213-3890
49 *
50 * any improvements or extensions that they make and grant Carnegie Mellon
51 * the rights to redistribute these changes.
52 */
53 /*
54 */
55 /*
56 * File: vm_fault.c
57 * Author: Avadis Tevanian, Jr., Michael Wayne Young
58 *
59 * Page fault handling module.
60 */
61 #ifdef MACH_BSD
62 /* remove after component interface available */
63 extern int vnode_pager_workaround;
64 extern int device_pager_workaround;
65 #endif
66
67 #include <mach_cluster_stats.h>
68 #include <mach_pagemap.h>
69 #include <mach_kdb.h>
70
71 #include <vm/vm_fault.h>
72 #include <mach/kern_return.h>
73 #include <mach/message.h> /* for error codes */
74 #include <kern/host_statistics.h>
75 #include <kern/counters.h>
76 #include <kern/task.h>
77 #include <kern/thread.h>
78 #include <kern/sched_prim.h>
79 #include <kern/host.h>
80 #include <kern/xpr.h>
81 #include <ppc/proc_reg.h>
82 #include <vm/task_working_set.h>
83 #include <vm/vm_map.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_page.h>
86 #include <vm/pmap.h>
87 #include <vm/vm_pageout.h>
88 #include <mach/vm_param.h>
89 #include <mach/vm_behavior.h>
90 #include <mach/memory_object.h>
91 /* For memory_object_data_{request,unlock} */
92 #include <kern/mach_param.h>
93 #include <kern/macro_help.h>
94 #include <kern/zalloc.h>
95 #include <kern/misc_protos.h>
96
97 #include <sys/kdebug.h>
98
99 #define VM_FAULT_CLASSIFY 0
100 #define VM_FAULT_STATIC_CONFIG 1
101
102 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
103
104 int vm_object_absent_max = 50;
105
106 int vm_fault_debug = 0;
107 boolean_t vm_page_deactivate_behind = TRUE;
108
109
110 #if !VM_FAULT_STATIC_CONFIG
111 boolean_t vm_fault_dirty_handling = FALSE;
112 boolean_t vm_fault_interruptible = FALSE;
113 boolean_t software_reference_bits = TRUE;
114 #endif
115
116 #if MACH_KDB
117 extern struct db_watchpoint *db_watchpoint_list;
118 #endif /* MACH_KDB */
119
120 /* Forward declarations of internal routines. */
121 extern kern_return_t vm_fault_wire_fast(
122 vm_map_t map,
123 vm_offset_t va,
124 vm_map_entry_t entry,
125 pmap_t pmap,
126 vm_offset_t pmap_addr);
127
128 extern void vm_fault_continue(void);
129
130 extern void vm_fault_copy_cleanup(
131 vm_page_t page,
132 vm_page_t top_page);
133
134 extern void vm_fault_copy_dst_cleanup(
135 vm_page_t page);
136
137 #if VM_FAULT_CLASSIFY
138 extern void vm_fault_classify(vm_object_t object,
139 vm_object_offset_t offset,
140 vm_prot_t fault_type);
141
142 extern void vm_fault_classify_init(void);
143 #endif
144
145 /*
146 * Routine: vm_fault_init
147 * Purpose:
148 * Initialize our private data structures.
149 */
150 void
151 vm_fault_init(void)
152 {
153 }
154
155 /*
156 * Routine: vm_fault_cleanup
157 * Purpose:
158 * Clean up the result of vm_fault_page.
159 * Results:
160 * The paging reference for "object" is released.
161 * "object" is unlocked.
162 * If "top_page" is not null, "top_page" is
163 * freed and the paging reference for the object
164 * containing it is released.
165 *
166 * In/out conditions:
167 * "object" must be locked.
168 */
169 void
170 vm_fault_cleanup(
171 register vm_object_t object,
172 register vm_page_t top_page)
173 {
174 vm_object_paging_end(object);
175 vm_object_unlock(object);
176
177 if (top_page != VM_PAGE_NULL) {
178 object = top_page->object;
179 vm_object_lock(object);
180 VM_PAGE_FREE(top_page);
181 vm_object_paging_end(object);
182 vm_object_unlock(object);
183 }
184 }
185
186 #if MACH_CLUSTER_STATS
187 #define MAXCLUSTERPAGES 16
188 struct {
189 unsigned long pages_in_cluster;
190 unsigned long pages_at_higher_offsets;
191 unsigned long pages_at_lower_offsets;
192 } cluster_stats_in[MAXCLUSTERPAGES];
193 #define CLUSTER_STAT(clause) clause
194 #define CLUSTER_STAT_HIGHER(x) \
195 ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
196 #define CLUSTER_STAT_LOWER(x) \
197 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
198 #define CLUSTER_STAT_CLUSTER(x) \
199 ((cluster_stats_in[(x)].pages_in_cluster)++)
200 #else /* MACH_CLUSTER_STATS */
201 #define CLUSTER_STAT(clause)
202 #endif /* MACH_CLUSTER_STATS */
203
204 /* XXX - temporary */
205 boolean_t vm_allow_clustered_pagein = FALSE;
206 int vm_pagein_cluster_used = 0;
207
208 /*
209 * Prepage default sizes given VM_BEHAVIOR_DEFAULT reference behavior
210 */
211 int vm_default_ahead = 1; /* Number of pages to prepage ahead */
212 int vm_default_behind = 0; /* Number of pages to prepage behind */
213
214 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
215
216 /*
217 * Routine: vm_fault_page
218 * Purpose:
219 * Find the resident page for the virtual memory
220 * specified by the given virtual memory object
221 * and offset.
222 * Additional arguments:
 223  *	The required permissions for the page are given
224 * in "fault_type". Desired permissions are included
225 * in "protection". The minimum and maximum valid offsets
226 * within the object for the relevant map entry are
227 * passed in "lo_offset" and "hi_offset" respectively and
228 * the expected page reference pattern is passed in "behavior".
229 * These three parameters are used to determine pagein cluster
230 * limits.
231 *
232 * If the desired page is known to be resident (for
233 * example, because it was previously wired down), asserting
234 * the "unwiring" parameter will speed the search.
235 *
236 * If the operation can be interrupted (by thread_abort
237 * or thread_terminate), then the "interruptible"
238 * parameter should be asserted.
239 *
240 * Results:
241 * The page containing the proper data is returned
242 * in "result_page".
243 *
244 * In/out conditions:
245 * The source object must be locked and referenced,
246 * and must donate one paging reference. The reference
247 * is not affected. The paging reference and lock are
248 * consumed.
249 *
250 * If the call succeeds, the object in which "result_page"
251 * resides is left locked and holding a paging reference.
252 * If this is not the original object, a busy page in the
253 * original object is returned in "top_page", to prevent other
254 * callers from pursuing this same data, along with a paging
255 * reference for the original object. The "top_page" should
256 * be destroyed when this guarantee is no longer required.
257 * The "result_page" is also left busy. It is not removed
258 * from the pageout queues.
259 */
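/*
 * Illustrative calling convention (a sketch only, not the actual caller
 * code; "lo", "hi" and the FALSE flags below are placeholder values):
 *
 *	vm_object_lock(object);
 *	vm_object_paging_begin(object);		(donate a paging reference)
 *	kr = vm_fault_page(object, offset, fault_type, FALSE,
 *			   interruptible, lo, hi, behavior,
 *			   &prot, &result_page, &top_page,
 *			   &type_of_fault, &error, FALSE, FALSE,
 *			   map, vaddr);
 *	if (kr == VM_FAULT_SUCCESS) {
 *		...use result_page, which is returned busy...
 *		PAGE_WAKEUP_DONE(result_page);
 *		vm_fault_cleanup(result_page->object, top_page);
 *	}
 */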
260
261 vm_fault_return_t
262 vm_fault_page(
263 /* Arguments: */
264 vm_object_t first_object, /* Object to begin search */
265 vm_object_offset_t first_offset, /* Offset into object */
266 vm_prot_t fault_type, /* What access is requested */
267 boolean_t must_be_resident,/* Must page be resident? */
268 int interruptible, /* how may fault be interrupted? */
269 vm_object_offset_t lo_offset, /* Map entry start */
270 vm_object_offset_t hi_offset, /* Map entry end */
271 vm_behavior_t behavior, /* Page reference behavior */
272 /* Modifies in place: */
273 vm_prot_t *protection, /* Protection for mapping */
274 /* Returns: */
275 vm_page_t *result_page, /* Page found, if successful */
276 vm_page_t *top_page, /* Page in top object, if
277 * not result_page. */
278 int *type_of_fault, /* if non-null, fill in with type of fault
279 * COW, zero-fill, etc... returned in trace point */
280 /* More arguments: */
281 kern_return_t *error_code, /* code if page is in error */
282 boolean_t no_zero_fill, /* don't zero fill absent pages */
283 boolean_t data_supply, /* treat as data_supply if
284 * it is a write fault and a full
285 * page is provided */
286 vm_map_t map,
287 vm_offset_t vaddr)
288 {
289 register
290 vm_page_t m;
291 register
292 vm_object_t object;
293 register
294 vm_object_offset_t offset;
295 vm_page_t first_m;
296 vm_object_t next_object;
297 vm_object_t copy_object;
298 boolean_t look_for_page;
299 vm_prot_t access_required = fault_type;
300 vm_prot_t wants_copy_flag;
301 vm_size_t cluster_size, length;
302 vm_object_offset_t cluster_offset;
303 vm_object_offset_t cluster_start, cluster_end, paging_offset;
304 vm_object_offset_t align_offset;
305 CLUSTER_STAT(int pages_at_higher_offsets;)
306 CLUSTER_STAT(int pages_at_lower_offsets;)
307 kern_return_t wait_result;
308 boolean_t interruptible_state;
309 boolean_t bumped_pagein = FALSE;
310
311
312 #if MACH_PAGEMAP
313 /*
314 * MACH page map - an optional optimization where a bit map is maintained
315 * by the VM subsystem for internal objects to indicate which pages of
316 * the object currently reside on backing store. This existence map
317 * duplicates information maintained by the vnode pager. It is
318 * created at the time of the first pageout against the object, i.e.
 319  * at the same time the pager for the object is created. The optimization
320 * is designed to eliminate pager interaction overhead, if it is
321 * 'known' that the page does not exist on backing store.
322 *
323 * LOOK_FOR() evaluates to TRUE if the page specified by object/offset is
324 * either marked as paged out in the existence map for the object or no
325 * existence map exists for the object. LOOK_FOR() is one of the
326 * criteria in the decision to invoke the pager. It is also used as one
327 * of the criteria to terminate the scan for adjacent pages in a clustered
328 * pagein operation. Note that LOOK_FOR() always evaluates to TRUE for
329 * permanent objects. Note also that if the pager for an internal object
330 * has not been created, the pager is not invoked regardless of the value
331 * of LOOK_FOR() and that clustered pagein scans are only done on an object
332 * for which a pager has been created.
333 *
334 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 335  * is marked as paged out in the existence map for the object.
 336  * PAGED_OUT() is used to determine if a page has already been pushed
337 * into a copy object in order to avoid a redundant page out operation.
338 */
339 #define LOOK_FOR(o, f) (vm_external_state_get((o)->existence_map, (f)) \
340 != VM_EXTERNAL_STATE_ABSENT)
341 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
342 == VM_EXTERNAL_STATE_EXISTS)
343 #else /* MACH_PAGEMAP */
344 /*
345 * If the MACH page map optimization is not enabled,
346 * LOOK_FOR() always evaluates to TRUE. The pager will always be
347 * invoked to resolve missing pages in an object, assuming the pager
348 * has been created for the object. In a clustered page operation, the
 349  * absence of a page on backing store cannot be used to terminate
350 * a scan for adjacent pages since that information is available only in
351 * the pager. Hence pages that may not be paged out are potentially
352 * included in a clustered request. The vnode pager is coded to deal
353 * with any combination of absent/present pages in a clustered
354 * pagein request. PAGED_OUT() always evaluates to FALSE, i.e. the pager
355 * will always be invoked to push a dirty page into a copy object assuming
356 * a pager has been created. If the page has already been pushed, the
 357  * pager will ignore the new request.
358 */
359 #define LOOK_FOR(o, f) TRUE
360 #define PAGED_OUT(o, f) FALSE
361 #endif /* MACH_PAGEMAP */
362
363 /*
364 * Recovery actions
365 */
366 #define PREPARE_RELEASE_PAGE(m) \
367 MACRO_BEGIN \
368 vm_page_lock_queues(); \
369 MACRO_END
370
371 #define DO_RELEASE_PAGE(m) \
372 MACRO_BEGIN \
373 PAGE_WAKEUP_DONE(m); \
374 if (!m->active && !m->inactive) \
375 vm_page_activate(m); \
376 vm_page_unlock_queues(); \
377 MACRO_END
378
379 #define RELEASE_PAGE(m) \
380 MACRO_BEGIN \
381 PREPARE_RELEASE_PAGE(m); \
382 DO_RELEASE_PAGE(m); \
383 MACRO_END
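/*
 * RELEASE_PAGE() is used on the error/backoff paths below (for example
 * when vm_page_grab() or vm_page_alloc() fails): it wakes any threads
 * waiting on the busy page and reactivates the page if it is not
 * already on a paging queue, before the fault is abandoned via
 * vm_fault_cleanup().
 */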
384
385 #if TRACEFAULTPAGE
386 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
387 #endif
388
389
390
391 #if !VM_FAULT_STATIC_CONFIG
392 if (vm_fault_dirty_handling
393 #if MACH_KDB
394 /*
395 * If there are watchpoints set, then
396 * we don't want to give away write permission
397 * on a read fault. Make the task write fault,
398 * so that the watchpoint code notices the access.
399 */
400 || db_watchpoint_list
401 #endif /* MACH_KDB */
402 ) {
403 /*
404 * If we aren't asking for write permission,
405 * then don't give it away. We're using write
406 * faults to set the dirty bit.
407 */
408 if (!(fault_type & VM_PROT_WRITE))
409 *protection &= ~VM_PROT_WRITE;
410 }
411
412 if (!vm_fault_interruptible)
413 interruptible = THREAD_UNINT;
414 #else /* STATIC_CONFIG */
415 #if MACH_KDB
416 /*
417 * If there are watchpoints set, then
418 * we don't want to give away write permission
419 * on a read fault. Make the task write fault,
420 * so that the watchpoint code notices the access.
421 */
422 if (db_watchpoint_list) {
423 /*
424 * If we aren't asking for write permission,
425 * then don't give it away. We're using write
426 * faults to set the dirty bit.
427 */
428 if (!(fault_type & VM_PROT_WRITE))
429 *protection &= ~VM_PROT_WRITE;
430 }
431
432 #endif /* MACH_KDB */
433 #endif /* STATIC_CONFIG */
434
435 interruptible_state = thread_interrupt_level(interruptible);
436
437 /*
438 * INVARIANTS (through entire routine):
439 *
440 * 1) At all times, we must either have the object
441 * lock or a busy page in some object to prevent
442 * some other thread from trying to bring in
443 * the same page.
444 *
445 * Note that we cannot hold any locks during the
446 * pager access or when waiting for memory, so
447 * we use a busy page then.
448 *
449 * Note also that we aren't as concerned about more than
450 * one thread attempting to memory_object_data_unlock
451 * the same page at once, so we don't hold the page
452 * as busy then, but do record the highest unlock
453 * value so far. [Unlock requests may also be delivered
454 * out of order.]
455 *
456 * 2) To prevent another thread from racing us down the
457 * shadow chain and entering a new page in the top
458 * object before we do, we must keep a busy page in
459 * the top object while following the shadow chain.
460 *
461 * 3) We must increment paging_in_progress on any object
462 * for which we have a busy page
463 *
464 * 4) We leave busy pages on the pageout queues.
465 * If the pageout daemon comes across a busy page,
466 * it will remove the page from the pageout queues.
467 */
468
469 /*
470 * Search for the page at object/offset.
471 */
472
473 object = first_object;
474 offset = first_offset;
475 first_m = VM_PAGE_NULL;
476 access_required = fault_type;
477
478 XPR(XPR_VM_FAULT,
479 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
480 (integer_t)object, offset, fault_type, *protection, 0);
481
482 /*
483 * See whether this page is resident
484 */
485
486 while (TRUE) {
487 #if TRACEFAULTPAGE
488 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
489 #endif
490 if (!object->alive) {
491 vm_fault_cleanup(object, first_m);
492 thread_interrupt_level(interruptible_state);
493 return(VM_FAULT_MEMORY_ERROR);
494 }
495 m = vm_page_lookup(object, offset);
496 #if TRACEFAULTPAGE
497 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
498 #endif
499 if (m != VM_PAGE_NULL) {
500 /*
501 * If the page was pre-paged as part of a
502 * cluster, record the fact.
503 */
504 if (m->clustered) {
505 vm_pagein_cluster_used++;
506 m->clustered = FALSE;
507 }
508
509 /*
510 * If the page is being brought in,
511 * wait for it and then retry.
512 *
513 * A possible optimization: if the page
514 * is known to be resident, we can ignore
515 * pages that are absent (regardless of
516 * whether they're busy).
517 */
518
519 if (m->busy) {
520 #if TRACEFAULTPAGE
521 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
522 #endif
523 wait_result = PAGE_SLEEP(object, m, interruptible);
524 XPR(XPR_VM_FAULT,
525 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
526 (integer_t)object, offset,
527 (integer_t)m, 0, 0);
528 counter(c_vm_fault_page_block_busy_kernel++);
529
530 if (wait_result != THREAD_AWAKENED) {
531 vm_fault_cleanup(object, first_m);
532 thread_interrupt_level(interruptible_state);
533 if (wait_result == THREAD_RESTART)
534 {
535 return(VM_FAULT_RETRY);
536 }
537 else
538 {
539 return(VM_FAULT_INTERRUPTED);
540 }
541 }
542 continue;
543 }
544
545 /*
546 * If the page is in error, give up now.
547 */
548
549 if (m->error) {
550 #if TRACEFAULTPAGE
551 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
552 #endif
553 if (error_code)
554 *error_code = m->page_error;
555 VM_PAGE_FREE(m);
556 vm_fault_cleanup(object, first_m);
557 thread_interrupt_level(interruptible_state);
558 return(VM_FAULT_MEMORY_ERROR);
559 }
560
561 /*
562 * If the pager wants us to restart
563 * at the top of the chain,
564 * typically because it has moved the
565 * page to another pager, then do so.
566 */
567
568 if (m->restart) {
569 #if TRACEFAULTPAGE
570 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
571 #endif
572 VM_PAGE_FREE(m);
573 vm_fault_cleanup(object, first_m);
574 thread_interrupt_level(interruptible_state);
575 return(VM_FAULT_RETRY);
576 }
577
578 /*
579 * If the page isn't busy, but is absent,
580 * then it was deemed "unavailable".
581 */
582
583 if (m->absent) {
584 /*
585 * Remove the non-existent page (unless it's
586 * in the top object) and move on down to the
587 * next object (if there is one).
588 */
589 #if TRACEFAULTPAGE
590 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
591 #endif
592
593 next_object = object->shadow;
594 if (next_object == VM_OBJECT_NULL) {
595 vm_page_t real_m;
596
597 assert(!must_be_resident);
598
599 if (object->shadow_severed) {
600 vm_fault_cleanup(
601 object, first_m);
602 thread_interrupt_level(interruptible_state);
603 return VM_FAULT_MEMORY_ERROR;
604 }
605
606 /*
607 * Absent page at bottom of shadow
608 * chain; zero fill the page we left
609 * busy in the first object, and flush
610 * the absent page. But first we
611 * need to allocate a real page.
612 */
613 if (VM_PAGE_THROTTLED() ||
614 (real_m = vm_page_grab()) == VM_PAGE_NULL) {
615 vm_fault_cleanup(object, first_m);
616 thread_interrupt_level(interruptible_state);
617 return(VM_FAULT_MEMORY_SHORTAGE);
618 }
619
620
621 XPR(XPR_VM_FAULT,
622 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
623 (integer_t)object, offset,
624 (integer_t)m,
625 (integer_t)first_object, 0);
626 if (object != first_object) {
627 VM_PAGE_FREE(m);
628 vm_object_paging_end(object);
629 vm_object_unlock(object);
630 object = first_object;
631 offset = first_offset;
632 m = first_m;
633 first_m = VM_PAGE_NULL;
634 vm_object_lock(object);
635 }
636
637 VM_PAGE_FREE(m);
638 assert(real_m->busy);
639 vm_page_insert(real_m, object, offset);
640 m = real_m;
641
642 /*
643 * Drop the lock while zero filling
644 * page. Then break because this
645 * is the page we wanted. Checking
646 * the page lock is a waste of time;
647 * this page was either absent or
648 * newly allocated -- in both cases
649 * it can't be page locked by a pager.
650 */
651 m->no_isync = FALSE;
652
653 if (!no_zero_fill) {
654 vm_object_unlock(object);
655 vm_page_zero_fill(m);
656 if (type_of_fault)
657 *type_of_fault = DBG_ZERO_FILL_FAULT;
658 VM_STAT(zero_fill_count++);
659
660 if (bumped_pagein == TRUE) {
661 VM_STAT(pageins--);
662 current_task()->pageins--;
663 }
664 vm_object_lock(object);
665 }
666 pmap_clear_modify(m->phys_page);
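			/*
			 * Queue the freshly zero-filled page: pages in
			 * large (> 0x80000 byte) objects go on the dedicated
			 * zero-fill queue, others on the inactive queue.
			 * Each page is stamped with the current page ticket;
			 * the ticket rolls over every VM_PAGE_TICKETS_IN_ROLL
			 * insertions, presumably so the pageout code can age
			 * pages in batches.
			 */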
667 vm_page_lock_queues();
668 VM_PAGE_QUEUES_REMOVE(m);
669 m->page_ticket = vm_page_ticket;
670 if(m->object->size > 0x80000) {
671 m->zero_fill = TRUE;
672 /* depends on the queues lock */
673 vm_zf_count += 1;
674 queue_enter(&vm_page_queue_zf,
675 m, vm_page_t, pageq);
676 } else {
677 queue_enter(
678 &vm_page_queue_inactive,
679 m, vm_page_t, pageq);
680 }
681 vm_page_ticket_roll++;
682 if(vm_page_ticket_roll ==
683 VM_PAGE_TICKETS_IN_ROLL) {
684 vm_page_ticket_roll = 0;
685 if(vm_page_ticket ==
686 VM_PAGE_TICKET_ROLL_IDS)
687 vm_page_ticket= 0;
688 else
689 vm_page_ticket++;
690 }
691 m->inactive = TRUE;
692 vm_page_inactive_count++;
693 vm_page_unlock_queues();
694 break;
695 } else {
696 if (must_be_resident) {
697 vm_object_paging_end(object);
698 } else if (object != first_object) {
699 vm_object_paging_end(object);
700 VM_PAGE_FREE(m);
701 } else {
702 first_m = m;
703 m->absent = FALSE;
704 m->unusual = FALSE;
705 vm_object_absent_release(object);
706 m->busy = TRUE;
707
708 vm_page_lock_queues();
709 VM_PAGE_QUEUES_REMOVE(m);
710 vm_page_unlock_queues();
711 }
712 XPR(XPR_VM_FAULT,
713 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
714 (integer_t)object, offset,
715 (integer_t)next_object,
716 offset+object->shadow_offset,0);
717 offset += object->shadow_offset;
718 hi_offset += object->shadow_offset;
719 lo_offset += object->shadow_offset;
720 access_required = VM_PROT_READ;
721 vm_object_lock(next_object);
722 vm_object_unlock(object);
723 object = next_object;
724 vm_object_paging_begin(object);
725 continue;
726 }
727 }
728
729 if ((m->cleaning)
730 && ((object != first_object) ||
731 (object->copy != VM_OBJECT_NULL))
732 && (fault_type & VM_PROT_WRITE)) {
733 /*
734 * This is a copy-on-write fault that will
735 * cause us to revoke access to this page, but
736 * this page is in the process of being cleaned
737 * in a clustered pageout. We must wait until
738 * the cleaning operation completes before
739 * revoking access to the original page,
740 * otherwise we might attempt to remove a
741 * wired mapping.
742 */
743 #if TRACEFAULTPAGE
744 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
745 #endif
746 XPR(XPR_VM_FAULT,
747 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
748 (integer_t)object, offset,
749 (integer_t)m, 0, 0);
750 /* take an extra ref so that object won't die */
751 assert(object->ref_count > 0);
752 object->ref_count++;
753 vm_object_res_reference(object);
754 vm_fault_cleanup(object, first_m);
755 counter(c_vm_fault_page_block_backoff_kernel++);
756 vm_object_lock(object);
757 assert(object->ref_count > 0);
758 m = vm_page_lookup(object, offset);
759 if (m != VM_PAGE_NULL && m->cleaning) {
760 PAGE_ASSERT_WAIT(m, interruptible);
761 vm_object_unlock(object);
762 wait_result = thread_block(THREAD_CONTINUE_NULL);
763 vm_object_deallocate(object);
764 goto backoff;
765 } else {
766 vm_object_unlock(object);
767 vm_object_deallocate(object);
768 thread_interrupt_level(interruptible_state);
769 return VM_FAULT_RETRY;
770 }
771 }
772
773 /*
774 * If the desired access to this page has
775 * been locked out, request that it be unlocked.
776 */
777
778 if (access_required & m->page_lock) {
779 if ((access_required & m->unlock_request) != access_required) {
780 vm_prot_t new_unlock_request;
781 kern_return_t rc;
782
783 #if TRACEFAULTPAGE
784 dbgTrace(0xBEEF000A, (unsigned int) m, (unsigned int) object->pager_ready); /* (TEST/DEBUG) */
785 #endif
786 if (!object->pager_ready) {
787 XPR(XPR_VM_FAULT,
788 "vm_f_page: ready wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
789 access_required,
790 (integer_t)object, offset,
791 (integer_t)m, 0);
792 /* take an extra ref */
793 assert(object->ref_count > 0);
794 object->ref_count++;
795 vm_object_res_reference(object);
796 vm_fault_cleanup(object,
797 first_m);
798 counter(c_vm_fault_page_block_backoff_kernel++);
799 vm_object_lock(object);
800 assert(object->ref_count > 0);
801 if (!object->pager_ready) {
802 wait_result = vm_object_assert_wait(
803 object,
804 VM_OBJECT_EVENT_PAGER_READY,
805 interruptible);
806 vm_object_unlock(object);
807 if (wait_result == THREAD_WAITING)
808 wait_result = thread_block(THREAD_CONTINUE_NULL);
809 vm_object_deallocate(object);
810 goto backoff;
811 } else {
812 vm_object_unlock(object);
813 vm_object_deallocate(object);
814 thread_interrupt_level(interruptible_state);
815 return VM_FAULT_RETRY;
816 }
817 }
818
819 new_unlock_request = m->unlock_request =
820 (access_required | m->unlock_request);
821 vm_object_unlock(object);
822 XPR(XPR_VM_FAULT,
823 "vm_f_page: unlock obj 0x%X, offset 0x%X, page 0x%X, unl_req %d\n",
824 (integer_t)object, offset,
825 (integer_t)m, new_unlock_request, 0);
826 if ((rc = memory_object_data_unlock(
827 object->pager,
828 offset + object->paging_offset,
829 PAGE_SIZE,
830 new_unlock_request))
831 != KERN_SUCCESS) {
832 if (vm_fault_debug)
833 printf("vm_fault: memory_object_data_unlock failed\n");
834 vm_object_lock(object);
835 vm_fault_cleanup(object, first_m);
836 thread_interrupt_level(interruptible_state);
837 return((rc == MACH_SEND_INTERRUPTED) ?
838 VM_FAULT_INTERRUPTED :
839 VM_FAULT_MEMORY_ERROR);
840 }
841 vm_object_lock(object);
842 continue;
843 }
844
845 XPR(XPR_VM_FAULT,
846 "vm_f_page: access wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
847 access_required, (integer_t)object,
848 offset, (integer_t)m, 0);
849 /* take an extra ref so object won't die */
850 assert(object->ref_count > 0);
851 object->ref_count++;
852 vm_object_res_reference(object);
853 vm_fault_cleanup(object, first_m);
854 counter(c_vm_fault_page_block_backoff_kernel++);
855 vm_object_lock(object);
856 assert(object->ref_count > 0);
857 m = vm_page_lookup(object, offset);
858 if (m != VM_PAGE_NULL &&
859 (access_required & m->page_lock) &&
860 !((access_required & m->unlock_request) != access_required)) {
861 PAGE_ASSERT_WAIT(m, interruptible);
862 vm_object_unlock(object);
863 wait_result = thread_block(THREAD_CONTINUE_NULL);
864 vm_object_deallocate(object);
865 goto backoff;
866 } else {
867 vm_object_unlock(object);
868 vm_object_deallocate(object);
869 thread_interrupt_level(interruptible_state);
870 return VM_FAULT_RETRY;
871 }
872 }
873 /*
874 * We mark the page busy and leave it on
875 * the pageout queues. If the pageout
 876  *	daemon comes across it, then it will
877 * remove the page.
878 */
879
880 #if TRACEFAULTPAGE
881 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
882 #endif
883
884 #if !VM_FAULT_STATIC_CONFIG
885 if (!software_reference_bits) {
886 vm_page_lock_queues();
887 if (m->inactive)
888 vm_stat.reactivations++;
889
890 VM_PAGE_QUEUES_REMOVE(m);
891 vm_page_unlock_queues();
892 }
893 #endif
894 XPR(XPR_VM_FAULT,
895 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
896 (integer_t)object, offset, (integer_t)m, 0, 0);
897 assert(!m->busy);
898 m->busy = TRUE;
899 assert(!m->absent);
900 break;
901 }
902
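	/*
	 * Decide whether the pager should be consulted for this
	 * object/offset: only if a pager has been created, the
	 * existence map (LOOK_FOR) does not rule the page out, and
	 * the caller is not supplying the data itself (data_supply).
	 */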
903 look_for_page =
904 (object->pager_created) &&
905 LOOK_FOR(object, offset) &&
906 (!data_supply);
907
908 #if TRACEFAULTPAGE
909 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
910 #endif
911 if ((look_for_page || (object == first_object))
912 && !must_be_resident
913 && !(object->phys_contiguous)) {
914 /*
915 * Allocate a new page for this object/offset
916 * pair.
917 */
918
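		/*
		 * The fictitious page acts as a busy placeholder at this
		 * object/offset so that concurrent faulters block on it.
		 * If the pager is asked for data it is marked absent below,
		 * and for internal (default pager) objects it is converted
		 * to a real page via vm_page_convert().
		 */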
919 m = vm_page_grab_fictitious();
920 #if TRACEFAULTPAGE
921 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
922 #endif
923 if (m == VM_PAGE_NULL) {
924 vm_fault_cleanup(object, first_m);
925 thread_interrupt_level(interruptible_state);
926 return(VM_FAULT_FICTITIOUS_SHORTAGE);
927 }
928 vm_page_insert(m, object, offset);
929 }
930
931 if ((look_for_page && !must_be_resident)) {
932 kern_return_t rc;
933
934 /*
935 * If the memory manager is not ready, we
936 * cannot make requests.
937 */
938 if (!object->pager_ready) {
939 #if TRACEFAULTPAGE
940 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
941 #endif
942 if(m != VM_PAGE_NULL)
943 VM_PAGE_FREE(m);
944 XPR(XPR_VM_FAULT,
945 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
946 (integer_t)object, offset, 0, 0, 0);
947 /* take an extra ref so object won't die */
948 assert(object->ref_count > 0);
949 object->ref_count++;
950 vm_object_res_reference(object);
951 vm_fault_cleanup(object, first_m);
952 counter(c_vm_fault_page_block_backoff_kernel++);
953 vm_object_lock(object);
954 assert(object->ref_count > 0);
955 if (!object->pager_ready) {
956 wait_result = vm_object_assert_wait(object,
957 VM_OBJECT_EVENT_PAGER_READY,
958 interruptible);
959 vm_object_unlock(object);
960 if (wait_result == THREAD_WAITING)
961 wait_result = thread_block(THREAD_CONTINUE_NULL);
962 vm_object_deallocate(object);
963 goto backoff;
964 } else {
965 vm_object_unlock(object);
966 vm_object_deallocate(object);
967 thread_interrupt_level(interruptible_state);
968 return VM_FAULT_RETRY;
969 }
970 }
971
972 if(object->phys_contiguous) {
973 if(m != VM_PAGE_NULL) {
974 VM_PAGE_FREE(m);
975 m = VM_PAGE_NULL;
976 }
977 goto no_clustering;
978 }
979 if (object->internal) {
980 /*
981 * Requests to the default pager
982 * must reserve a real page in advance,
 983  *	because the pager's data-provided path
984 * won't block for pages. IMPORTANT:
985 * this acts as a throttling mechanism
986 * for data_requests to the default
987 * pager.
988 */
989
990 #if TRACEFAULTPAGE
991 dbgTrace(0xBEEF000F, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
992 #endif
993 if (m->fictitious && !vm_page_convert(m)) {
994 VM_PAGE_FREE(m);
995 vm_fault_cleanup(object, first_m);
996 thread_interrupt_level(interruptible_state);
997 return(VM_FAULT_MEMORY_SHORTAGE);
998 }
999 } else if (object->absent_count >
1000 vm_object_absent_max) {
1001 /*
1002 * If there are too many outstanding page
1003 * requests pending on this object, we
1004 * wait for them to be resolved now.
1005 */
1006
1007 #if TRACEFAULTPAGE
1008 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1009 #endif
1010 if(m != VM_PAGE_NULL)
1011 VM_PAGE_FREE(m);
1012 /* take an extra ref so object won't die */
1013 assert(object->ref_count > 0);
1014 object->ref_count++;
1015 vm_object_res_reference(object);
1016 vm_fault_cleanup(object, first_m);
1017 counter(c_vm_fault_page_block_backoff_kernel++);
1018 vm_object_lock(object);
1019 assert(object->ref_count > 0);
1020 if (object->absent_count > vm_object_absent_max) {
1021 vm_object_absent_assert_wait(object,
1022 interruptible);
1023 vm_object_unlock(object);
1024 wait_result = thread_block(THREAD_CONTINUE_NULL);
1025 vm_object_deallocate(object);
1026 goto backoff;
1027 } else {
1028 vm_object_unlock(object);
1029 vm_object_deallocate(object);
1030 thread_interrupt_level(interruptible_state);
1031 return VM_FAULT_RETRY;
1032 }
1033 }
1034
1035 /*
1036 * Indicate that the page is waiting for data
1037 * from the memory manager.
1038 */
1039
1040 if(m != VM_PAGE_NULL) {
1041
1042 m->list_req_pending = TRUE;
1043 m->absent = TRUE;
1044 m->unusual = TRUE;
1045 object->absent_count++;
1046
1047 }
1048
1049 no_clustering:
1050 cluster_start = offset;
1051 length = PAGE_SIZE;
1052
1053 /*
1054 * lengthen the cluster by the pages in the working set
1055 */
1056 if((map != NULL) &&
1057 (current_task()->dynamic_working_set != 0)) {
1058 cluster_end = cluster_start + length;
1059 		/* tws values for start and end are just
1060 		 * suggestions. Therefore, as long as
1061 		 * build_cluster does not use pointers or
1062 		 * take action based on values that
1063 		 * could be affected by re-entrance, we
1064 * do not need to take the map lock.
1065 */
1066 cluster_end = offset + PAGE_SIZE_64;
1067 tws_build_cluster((tws_hash_t)
1068 current_task()->dynamic_working_set,
1069 object, &cluster_start,
1070 &cluster_end, 0x40000);
1071 length = cluster_end - cluster_start;
1072 }
1073 #if TRACEFAULTPAGE
1074 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1075 #endif
1076 /*
1077 * We have a busy page, so we can
1078 * release the object lock.
1079 */
1080 vm_object_unlock(object);
1081
1082 /*
1083 * Call the memory manager to retrieve the data.
1084 */
1085
1086 if (type_of_fault)
1087 *type_of_fault = (length << 8) | DBG_PAGEIN_FAULT;
1088 VM_STAT(pageins++);
1089 current_task()->pageins++;
1090 bumped_pagein = TRUE;
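			/*
			 * Remember that a pagein was charged to this task;
			 * the zero-fill paths above and below back the
			 * counters out again if the request ends up being
			 * satisfied by a zero-filled page instead (see the
			 * bumped_pagein checks).
			 */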
1091
1092 /*
1093 * If this object uses a copy_call strategy,
1094 * and we are interested in a copy of this object
1095 * (having gotten here only by following a
1096 * shadow chain), then tell the memory manager
1097 * via a flag added to the desired_access
1098 * parameter, so that it can detect a race
1099 * between our walking down the shadow chain
1100 * and its pushing pages up into a copy of
1101 * the object that it manages.
1102 */
1103
1104 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL &&
1105 object != first_object) {
1106 wants_copy_flag = VM_PROT_WANTS_COPY;
1107 } else {
1108 wants_copy_flag = VM_PROT_NONE;
1109 }
1110
1111 XPR(XPR_VM_FAULT,
1112 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1113 (integer_t)object, offset, (integer_t)m,
1114 access_required | wants_copy_flag, 0);
1115
1116 rc = memory_object_data_request(object->pager,
1117 cluster_start + object->paging_offset,
1118 length,
1119 access_required | wants_copy_flag);
1120
1121
1122 #if TRACEFAULTPAGE
1123 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1124 #endif
1125 if (rc != KERN_SUCCESS) {
1126 if (rc != MACH_SEND_INTERRUPTED
1127 && vm_fault_debug)
1128 printf("%s(0x%x, 0x%x, 0x%x, 0x%x) failed, rc=%d\n",
1129 "memory_object_data_request",
1130 object->pager,
1131 cluster_start + object->paging_offset,
1132 length, access_required, rc);
1133 /*
1134 * Don't want to leave a busy page around,
1135 * but the data request may have blocked,
1136 * so check if it's still there and busy.
1137 */
1138 if(!object->phys_contiguous) {
1139 vm_object_lock(object);
1140 for (; length; length -= PAGE_SIZE,
1141 cluster_start += PAGE_SIZE_64) {
1142 vm_page_t p;
1143 if ((p = vm_page_lookup(object,
1144 cluster_start))
1145 && p->absent && p->busy
1146 && p != first_m) {
1147 VM_PAGE_FREE(p);
1148 }
1149 }
1150 }
1151 vm_fault_cleanup(object, first_m);
1152 thread_interrupt_level(interruptible_state);
1153 return((rc == MACH_SEND_INTERRUPTED) ?
1154 VM_FAULT_INTERRUPTED :
1155 VM_FAULT_MEMORY_ERROR);
1156 } else {
1157 #ifdef notdefcdy
1158 tws_hash_line_t line;
1159 task_t task;
1160
1161 task = current_task();
1162
1163 if((map != NULL) &&
1164 				(task->dynamic_working_set != 0)
1165 				&& !(object->private)) {
1166 vm_object_t base_object;
1167 vm_object_offset_t base_offset;
1168 base_object = object;
1169 base_offset = offset;
1170 while(base_object->shadow) {
1171 base_offset +=
1172 base_object->shadow_offset;
1173 base_object =
1174 base_object->shadow;
1175 }
1176 if(tws_lookup
1177 ((tws_hash_t)
1178 task->dynamic_working_set,
1179 base_offset, base_object,
1180 &line) == KERN_SUCCESS) {
1181 tws_line_signal((tws_hash_t)
1182 task->dynamic_working_set,
1183 map, line, vaddr);
1184 }
1185 }
1186 #endif
1187 }
1188
1189 /*
1190 * Retry with same object/offset, since new data may
1191 * be in a different page (i.e., m is meaningless at
1192 * this point).
1193 */
1194 vm_object_lock(object);
1195 if ((interruptible != THREAD_UNINT) &&
1196 (current_thread()->state & TH_ABORT)) {
1197 vm_fault_cleanup(object, first_m);
1198 thread_interrupt_level(interruptible_state);
1199 return(VM_FAULT_INTERRUPTED);
1200 }
1201 if(m == VM_PAGE_NULL)
1202 break;
1203 continue;
1204 }
1205
1206 /*
1207 * The only case in which we get here is if
1208 * object has no pager (or unwiring). If the pager doesn't
1209 * have the page this is handled in the m->absent case above
1210 * (and if you change things here you should look above).
1211 */
1212 #if TRACEFAULTPAGE
1213 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1214 #endif
1215 if (object == first_object)
1216 first_m = m;
1217 else
1218 assert(m == VM_PAGE_NULL);
1219
1220 XPR(XPR_VM_FAULT,
1221 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1222 (integer_t)object, offset, (integer_t)m,
1223 (integer_t)object->shadow, 0);
1224 /*
1225 * Move on to the next object. Lock the next
1226 * object before unlocking the current one.
1227 */
1228 next_object = object->shadow;
1229 if (next_object == VM_OBJECT_NULL) {
1230 assert(!must_be_resident);
1231 /*
1232 * If there's no object left, fill the page
1233 * in the top object with zeros. But first we
1234 * need to allocate a real page.
1235 */
1236
1237 if (object != first_object) {
1238 vm_object_paging_end(object);
1239 vm_object_unlock(object);
1240
1241 object = first_object;
1242 offset = first_offset;
1243 vm_object_lock(object);
1244 }
1245
1246 m = first_m;
1247 assert(m->object == object);
1248 first_m = VM_PAGE_NULL;
1249
1250 if (object->shadow_severed) {
1251 VM_PAGE_FREE(m);
1252 vm_fault_cleanup(object, VM_PAGE_NULL);
1253 thread_interrupt_level(interruptible_state);
1254 return VM_FAULT_MEMORY_ERROR;
1255 }
1256
1257 if (VM_PAGE_THROTTLED() ||
1258 (m->fictitious && !vm_page_convert(m))) {
1259 VM_PAGE_FREE(m);
1260 vm_fault_cleanup(object, VM_PAGE_NULL);
1261 thread_interrupt_level(interruptible_state);
1262 return(VM_FAULT_MEMORY_SHORTAGE);
1263 }
1264 m->no_isync = FALSE;
1265
1266 if (!no_zero_fill) {
1267 vm_object_unlock(object);
1268 vm_page_zero_fill(m);
1269 if (type_of_fault)
1270 *type_of_fault = DBG_ZERO_FILL_FAULT;
1271 VM_STAT(zero_fill_count++);
1272
1273 if (bumped_pagein == TRUE) {
1274 VM_STAT(pageins--);
1275 current_task()->pageins--;
1276 }
1277 vm_object_lock(object);
1278 }
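			/*
			 * Same queue placement and page-ticket bookkeeping
			 * as the zero-fill case in the absent-page handling
			 * above.
			 */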
1279 vm_page_lock_queues();
1280 VM_PAGE_QUEUES_REMOVE(m);
1281 if(m->object->size > 0x80000) {
1282 m->zero_fill = TRUE;
1283 /* depends on the queues lock */
1284 vm_zf_count += 1;
1285 queue_enter(&vm_page_queue_zf,
1286 m, vm_page_t, pageq);
1287 } else {
1288 queue_enter(
1289 &vm_page_queue_inactive,
1290 m, vm_page_t, pageq);
1291 }
1292 m->page_ticket = vm_page_ticket;
1293 vm_page_ticket_roll++;
1294 if(vm_page_ticket_roll == VM_PAGE_TICKETS_IN_ROLL) {
1295 vm_page_ticket_roll = 0;
1296 if(vm_page_ticket ==
1297 VM_PAGE_TICKET_ROLL_IDS)
1298 vm_page_ticket= 0;
1299 else
1300 vm_page_ticket++;
1301 }
1302 m->inactive = TRUE;
1303 vm_page_inactive_count++;
1304 vm_page_unlock_queues();
1305 pmap_clear_modify(m->phys_page);
1306 break;
1307 }
1308 else {
1309 if ((object != first_object) || must_be_resident)
1310 vm_object_paging_end(object);
1311 offset += object->shadow_offset;
1312 hi_offset += object->shadow_offset;
1313 lo_offset += object->shadow_offset;
1314 access_required = VM_PROT_READ;
1315 vm_object_lock(next_object);
1316 vm_object_unlock(object);
1317 object = next_object;
1318 vm_object_paging_begin(object);
1319 }
1320 }
1321
1322 /*
1323 * PAGE HAS BEEN FOUND.
1324 *
1325 * This page (m) is:
1326 * busy, so that we can play with it;
1327 * not absent, so that nobody else will fill it;
1328 * possibly eligible for pageout;
1329 *
1330 * The top-level page (first_m) is:
1331 * VM_PAGE_NULL if the page was found in the
1332 * top-level object;
1333 * busy, not absent, and ineligible for pageout.
1334 *
1335 * The current object (object) is locked. A paging
1336 * reference is held for the current and top-level
1337 * objects.
1338 */
1339
1340 #if TRACEFAULTPAGE
1341 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1342 #endif
1343 #if EXTRA_ASSERTIONS
1344 if(m != VM_PAGE_NULL) {
1345 assert(m->busy && !m->absent);
1346 assert((first_m == VM_PAGE_NULL) ||
1347 (first_m->busy && !first_m->absent &&
1348 !first_m->active && !first_m->inactive));
1349 }
1350 #endif /* EXTRA_ASSERTIONS */
1351
1352 XPR(XPR_VM_FAULT,
1353 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1354 (integer_t)object, offset, (integer_t)m,
1355 (integer_t)first_object, (integer_t)first_m);
1356 /*
1357 * If the page is being written, but isn't
1358 * already owned by the top-level object,
1359 * we have to copy it into a new page owned
1360 * by the top-level object.
1361 */
1362
1363 if ((object != first_object) && (m != VM_PAGE_NULL)) {
1364 /*
1365 * We only really need to copy if we
1366 * want to write it.
1367 */
1368
1369 #if TRACEFAULTPAGE
1370 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1371 #endif
1372 if (fault_type & VM_PROT_WRITE) {
1373 vm_page_t copy_m;
1374
1375 assert(!must_be_resident);
1376
1377 /*
1378 * If we try to collapse first_object at this
1379 * point, we may deadlock when we try to get
1380 * the lock on an intermediate object (since we
1381 * have the bottom object locked). We can't
1382 * unlock the bottom object, because the page
1383 * we found may move (by collapse) if we do.
1384 *
1385 * Instead, we first copy the page. Then, when
1386 * we have no more use for the bottom object,
1387 * we unlock it and try to collapse.
1388 *
1389 * Note that we copy the page even if we didn't
1390 * need to... that's the breaks.
1391 */
1392
1393 /*
1394 * Allocate a page for the copy
1395 */
1396 copy_m = vm_page_grab();
1397 if (copy_m == VM_PAGE_NULL) {
1398 RELEASE_PAGE(m);
1399 vm_fault_cleanup(object, first_m);
1400 thread_interrupt_level(interruptible_state);
1401 return(VM_FAULT_MEMORY_SHORTAGE);
1402 }
1403
1404
1405 XPR(XPR_VM_FAULT,
1406 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1407 (integer_t)object, offset,
1408 (integer_t)m, (integer_t)copy_m, 0);
1409 vm_page_copy(m, copy_m);
1410
1411 /*
1412 * If another map is truly sharing this
1413 * page with us, we have to flush all
1414 * uses of the original page, since we
1415 * can't distinguish those which want the
1416 * original from those which need the
1417 * new copy.
1418 *
1419 * XXXO If we know that only one map has
1420 * access to this page, then we could
1421 * avoid the pmap_page_protect() call.
1422 */
1423
1424 vm_page_lock_queues();
1425 assert(!m->cleaning);
1426 pmap_page_protect(m->phys_page, VM_PROT_NONE);
1427 vm_page_deactivate(m);
1428 copy_m->dirty = TRUE;
1429 /*
1430 * Setting reference here prevents this fault from
1431 * being counted as a (per-thread) reactivate as well
1432 * as a copy-on-write.
1433 */
1434 first_m->reference = TRUE;
1435 vm_page_unlock_queues();
1436
1437 /*
1438 * We no longer need the old page or object.
1439 */
1440
1441 PAGE_WAKEUP_DONE(m);
1442 vm_object_paging_end(object);
1443 vm_object_unlock(object);
1444
1445 if (type_of_fault)
1446 *type_of_fault = DBG_COW_FAULT;
1447 VM_STAT(cow_faults++);
1448 current_task()->cow_faults++;
1449 object = first_object;
1450 offset = first_offset;
1451
1452 vm_object_lock(object);
1453 VM_PAGE_FREE(first_m);
1454 first_m = VM_PAGE_NULL;
1455 assert(copy_m->busy);
1456 vm_page_insert(copy_m, object, offset);
1457 m = copy_m;
1458
1459 /*
1460 * Now that we've gotten the copy out of the
1461 * way, let's try to collapse the top object.
1462 * But we have to play ugly games with
1463 * paging_in_progress to do that...
1464 */
1465
1466 vm_object_paging_end(object);
1467 vm_object_collapse(object);
1468 vm_object_paging_begin(object);
1469
1470 }
1471 else {
1472 *protection &= (~VM_PROT_WRITE);
1473 }
1474 }
1475
1476 /*
1477 * Now check whether the page needs to be pushed into the
1478 * copy object. The use of asymmetric copy on write for
1479 * shared temporary objects means that we may do two copies to
1480 * satisfy the fault; one above to get the page from a
1481 * shadowed object, and one here to push it into the copy.
1482 */
1483
1484 while ((copy_object = first_object->copy) != VM_OBJECT_NULL &&
1485 (m!= VM_PAGE_NULL)) {
1486 vm_object_offset_t copy_offset;
1487 vm_page_t copy_m;
1488
1489 #if TRACEFAULTPAGE
1490 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1491 #endif
1492 /*
1493 * If the page is being written, but hasn't been
1494 * copied to the copy-object, we have to copy it there.
1495 */
1496
1497 if ((fault_type & VM_PROT_WRITE) == 0) {
1498 *protection &= ~VM_PROT_WRITE;
1499 break;
1500 }
1501
1502 /*
1503 * If the page was guaranteed to be resident,
1504 * we must have already performed the copy.
1505 */
1506
1507 if (must_be_resident)
1508 break;
1509
1510 /*
1511 * Try to get the lock on the copy_object.
1512 */
1513 if (!vm_object_lock_try(copy_object)) {
1514 vm_object_unlock(object);
1515
1516 mutex_pause(); /* wait a bit */
1517
1518 vm_object_lock(object);
1519 continue;
1520 }
1521
1522 /*
1523 * Make another reference to the copy-object,
1524 * to keep it from disappearing during the
1525 * copy.
1526 */
1527 assert(copy_object->ref_count > 0);
1528 copy_object->ref_count++;
1529 VM_OBJ_RES_INCR(copy_object);
1530
1531 /*
1532 * Does the page exist in the copy?
1533 */
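		/*
		 * copy_object shadows first_object, so translate the
		 * top-level offset into copy_object's offset space.
		 */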
1534 copy_offset = first_offset - copy_object->shadow_offset;
1535 if (copy_object->size <= copy_offset)
1536 /*
1537 * Copy object doesn't cover this page -- do nothing.
1538 */
1539 ;
1540 else if ((copy_m =
1541 vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1542 /* Page currently exists in the copy object */
1543 if (copy_m->busy) {
1544 /*
1545 * If the page is being brought
1546 * in, wait for it and then retry.
1547 */
1548 RELEASE_PAGE(m);
1549 /* take an extra ref so object won't die */
1550 assert(copy_object->ref_count > 0);
1551 copy_object->ref_count++;
1552 vm_object_res_reference(copy_object);
1553 vm_object_unlock(copy_object);
1554 vm_fault_cleanup(object, first_m);
1555 counter(c_vm_fault_page_block_backoff_kernel++);
1556 vm_object_lock(copy_object);
1557 assert(copy_object->ref_count > 0);
1558 VM_OBJ_RES_DECR(copy_object);
1559 copy_object->ref_count--;
1560 assert(copy_object->ref_count > 0);
1561 copy_m = vm_page_lookup(copy_object, copy_offset);
1562 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1563 PAGE_ASSERT_WAIT(copy_m, interruptible);
1564 vm_object_unlock(copy_object);
1565 wait_result = thread_block(THREAD_CONTINUE_NULL);
1566 vm_object_deallocate(copy_object);
1567 goto backoff;
1568 } else {
1569 vm_object_unlock(copy_object);
1570 vm_object_deallocate(copy_object);
1571 thread_interrupt_level(interruptible_state);
1572 return VM_FAULT_RETRY;
1573 }
1574 }
1575 }
1576 else if (!PAGED_OUT(copy_object, copy_offset)) {
1577 /*
1578 * If PAGED_OUT is TRUE, then the page used to exist
1579 * in the copy-object, and has already been paged out.
1580 * We don't need to repeat this. If PAGED_OUT is
1581 * FALSE, then either we don't know (!pager_created,
1582 * for example) or it hasn't been paged out.
1583 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1584 * We must copy the page to the copy object.
1585 */
1586
1587 /*
1588 * Allocate a page for the copy
1589 */
1590 copy_m = vm_page_alloc(copy_object, copy_offset);
1591 if (copy_m == VM_PAGE_NULL) {
1592 RELEASE_PAGE(m);
1593 VM_OBJ_RES_DECR(copy_object);
1594 copy_object->ref_count--;
1595 assert(copy_object->ref_count > 0);
1596 vm_object_unlock(copy_object);
1597 vm_fault_cleanup(object, first_m);
1598 thread_interrupt_level(interruptible_state);
1599 return(VM_FAULT_MEMORY_SHORTAGE);
1600 }
1601
1602 /*
1603 * Must copy page into copy-object.
1604 */
1605
1606 vm_page_copy(m, copy_m);
1607
1608 /*
1609 * If the old page was in use by any users
1610 * of the copy-object, it must be removed
1611 * from all pmaps. (We can't know which
1612 * pmaps use it.)
1613 */
1614
1615 vm_page_lock_queues();
1616 assert(!m->cleaning);
1617 pmap_page_protect(m->phys_page, VM_PROT_NONE);
1618 copy_m->dirty = TRUE;
1619 vm_page_unlock_queues();
1620
1621 /*
1622 * If there's a pager, then immediately
1623 * page out this page, using the "initialize"
1624 * option. Else, we use the copy.
1625 */
1626
1627 if
1628 #if MACH_PAGEMAP
1629 ((!copy_object->pager_created) ||
1630 vm_external_state_get(
1631 copy_object->existence_map, copy_offset)
1632 == VM_EXTERNAL_STATE_ABSENT)
1633 #else
1634 (!copy_object->pager_created)
1635 #endif
1636 {
1637 vm_page_lock_queues();
1638 vm_page_activate(copy_m);
1639 vm_page_unlock_queues();
1640 PAGE_WAKEUP_DONE(copy_m);
1641 }
1642 else {
1643 assert(copy_m->busy == TRUE);
1644
1645 /*
1646 * The page is already ready for pageout:
1647 * not on pageout queues and busy.
1648 * Unlock everything except the
1649 * copy_object itself.
1650 */
1651
1652 vm_object_unlock(object);
1653
1654 /*
1655 * Write the page to the copy-object,
1656 * flushing it from the kernel.
1657 */
1658
1659 vm_pageout_initialize_page(copy_m);
1660
1661 /*
1662 * Since the pageout may have
1663 * temporarily dropped the
1664 * copy_object's lock, we
1665 * check whether we'll have
1666 * to deallocate the hard way.
1667 */
1668
1669 if ((copy_object->shadow != object) ||
1670 (copy_object->ref_count == 1)) {
1671 vm_object_unlock(copy_object);
1672 vm_object_deallocate(copy_object);
1673 vm_object_lock(object);
1674 continue;
1675 }
1676
1677 /*
1678 * Pick back up the old object's
1679 * lock. [It is safe to do so,
1680 * since it must be deeper in the
1681 * object tree.]
1682 */
1683
1684 vm_object_lock(object);
1685 }
1686
1687 /*
1688 * Because we're pushing a page upward
1689 * in the object tree, we must restart
1690 * any faults that are waiting here.
1691 * [Note that this is an expansion of
1692 * PAGE_WAKEUP that uses the THREAD_RESTART
1693 * wait result]. Can't turn off the page's
1694 * busy bit because we're not done with it.
1695 */
1696
1697 if (m->wanted) {
1698 m->wanted = FALSE;
1699 thread_wakeup_with_result((event_t) m,
1700 THREAD_RESTART);
1701 }
1702 }
1703
1704 /*
1705 * The reference count on copy_object must be
1706 * at least 2: one for our extra reference,
1707 * and at least one from the outside world
1708 * (we checked that when we last locked
1709 * copy_object).
1710 */
1711 copy_object->ref_count--;
1712 assert(copy_object->ref_count > 0);
1713 VM_OBJ_RES_DECR(copy_object);
1714 vm_object_unlock(copy_object);
1715
1716 break;
1717 }
1718
1719 *result_page = m;
1720 *top_page = first_m;
1721
1722 XPR(XPR_VM_FAULT,
1723 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1724 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0);
1725 /*
1726 * If the page can be written, assume that it will be.
1727 * [Earlier, we restrict the permission to allow write
1728 * access only if the fault so required, so we don't
1729 * mark read-only data as dirty.]
1730 */
1731
1732 #if !VM_FAULT_STATIC_CONFIG
1733 if (vm_fault_dirty_handling && (*protection & VM_PROT_WRITE) &&
1734 (m != VM_PAGE_NULL)) {
1735 m->dirty = TRUE;
1736 }
1737 #endif
1738 #if TRACEFAULTPAGE
1739 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_page_deactivate_behind); /* (TEST/DEBUG) */
1740 #endif
1741 if (vm_page_deactivate_behind) {
1742 if (offset && /* don't underflow */
1743 (object->last_alloc == (offset - PAGE_SIZE_64))) {
1744 m = vm_page_lookup(object, object->last_alloc);
1745 if ((m != VM_PAGE_NULL) && !m->busy) {
1746 vm_page_lock_queues();
1747 vm_page_deactivate(m);
1748 vm_page_unlock_queues();
1749 }
1750 #if TRACEFAULTPAGE
1751 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1752 #endif
1753 }
1754 object->last_alloc = offset;
1755 }
1756 #if TRACEFAULTPAGE
1757 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
1758 #endif
1759 thread_interrupt_level(interruptible_state);
1760 if(*result_page == VM_PAGE_NULL) {
1761 vm_object_unlock(object);
1762 }
1763 return(VM_FAULT_SUCCESS);
1764
1765 #if 0
1766 block_and_backoff:
1767 vm_fault_cleanup(object, first_m);
1768
1769 counter(c_vm_fault_page_block_backoff_kernel++);
1770 thread_block(THREAD_CONTINUE_NULL);
1771 #endif
1772
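/*
 * All of the "goto backoff" paths above arrive here after having unlocked
 * everything, blocked, and dropped the extra object reference they took;
 * just map the wait result onto a retryable or interrupted fault.
 */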
1773 backoff:
1774 thread_interrupt_level(interruptible_state);
1775 if (wait_result == THREAD_INTERRUPTED)
1776 return VM_FAULT_INTERRUPTED;
1777 return VM_FAULT_RETRY;
1778
1779 #undef RELEASE_PAGE
1780 }
1781
1782 /*
1783 * Routine: vm_fault
1784 * Purpose:
1785 * Handle page faults, including pseudo-faults
1786 * used to change the wiring status of pages.
1787 * Returns:
1788 * Explicit continuations have been removed.
1789 * Implementation:
1790 * vm_fault and vm_fault_page save mucho state
1791 * in the moral equivalent of a closure. The state
1792 * structure is allocated when first entering vm_fault
1793 * and deallocated when leaving vm_fault.
1794 */
1795
1796 kern_return_t
1797 vm_fault(
1798 vm_map_t map,
1799 vm_offset_t vaddr,
1800 vm_prot_t fault_type,
1801 boolean_t change_wiring,
1802 int interruptible,
1803 pmap_t caller_pmap,
1804 vm_offset_t caller_pmap_addr)
1805 {
1806 	vm_map_version_t	version;	/* Map version for verification */
1807 boolean_t wired; /* Should mapping be wired down? */
1808 vm_object_t object; /* Top-level object */
1809 vm_object_offset_t offset; /* Top-level offset */
1810 vm_prot_t prot; /* Protection for mapping */
1811 vm_behavior_t behavior; /* Expected paging behavior */
1812 vm_object_offset_t lo_offset, hi_offset;
1813 vm_object_t old_copy_object; /* Saved copy object */
1814 vm_page_t result_page; /* Result of vm_fault_page */
1815 vm_page_t top_page; /* Placeholder page */
1816 kern_return_t kr;
1817
1818 register
1819 vm_page_t m; /* Fast access to result_page */
1820 kern_return_t error_code; /* page error reasons */
1821 register
1822 vm_object_t cur_object;
1823 register
1824 vm_object_offset_t cur_offset;
1825 vm_page_t cur_m;
1826 vm_object_t new_object;
1827 int type_of_fault;
1828 vm_map_t pmap_map = map;
1829 vm_map_t original_map = map;
1830 pmap_t pmap = NULL;
1831 boolean_t funnel_set = FALSE;
1832 funnel_t *curflock;
1833 thread_t cur_thread;
1834 boolean_t interruptible_state;
1835 unsigned int cache_attr;
1836 int write_startup_file = 0;
1837 vm_prot_t full_fault_type;
1838
1839
1840 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_START,
1841 vaddr,
1842 0,
1843 0,
1844 0,
1845 0);
1846
1847 /* at present we do not fully check for execute permission */
1848 	/* we generally treat it as read except in certain device	    */
1849 /* memory settings */
1850 full_fault_type = fault_type;
1851 if(fault_type & VM_PROT_EXECUTE) {
1852 fault_type &= ~VM_PROT_EXECUTE;
1853 fault_type |= VM_PROT_READ;
1854 }
1855
1856 interruptible_state = thread_interrupt_level(interruptible);
1857
1858 /*
1859 * assume we will hit a page in the cache
1860 * otherwise, explicitly override with
1861 * the real fault type once we determine it
1862 */
1863 type_of_fault = DBG_CACHE_HIT_FAULT;
1864
1865 VM_STAT(faults++);
1866 current_task()->faults++;
1867
1868 /*
1869 * drop funnel if it is already held. Then restore while returning
1870 */
1871 cur_thread = current_thread();
1872
1873 if ((cur_thread->funnel_state & TH_FN_OWNED) == TH_FN_OWNED) {
1874 funnel_set = TRUE;
1875 curflock = cur_thread->funnel_lock;
1876 thread_funnel_set( curflock , FALSE);
1877 }
1878
1879 RetryFault: ;
1880
1881 /*
1882 * Find the backing store object and offset into
1883 * it to begin the search.
1884 */
1885 map = original_map;
1886 vm_map_lock_read(map);
1887 kr = vm_map_lookup_locked(&map, vaddr, fault_type, &version,
1888 &object, &offset,
1889 &prot, &wired,
1890 &behavior, &lo_offset, &hi_offset, &pmap_map);
1891
1892 pmap = pmap_map->pmap;
1893
1894 if (kr != KERN_SUCCESS) {
1895 vm_map_unlock_read(map);
1896 goto done;
1897 }
1898
1899 /*
1900 * If the page is wired, we must fault for the current protection
1901 * value, to avoid further faults.
1902 */
1903
1904 if (wired)
1905 fault_type = prot | VM_PROT_WRITE;
1906
1907 #if VM_FAULT_CLASSIFY
1908 /*
1909 * Temporary data gathering code
1910 */
1911 vm_fault_classify(object, offset, fault_type);
1912 #endif
1913 /*
1914 * Fast fault code. The basic idea is to do as much as
1915 * possible while holding the map lock and object locks.
1916 * Busy pages are not used until the object lock has to
1917 * be dropped to do something (copy, zero fill, pmap enter).
1918 * Similarly, paging references aren't acquired until that
1919 * point, and object references aren't used.
1920 *
1921 * If we can figure out what to do
1922 * (zero fill, copy on write, pmap enter) while holding
1923 * the locks, then it gets done. Otherwise, we give up,
1924 * and use the original fault path (which doesn't hold
1925 * the map lock, and relies on busy pages).
1926 * The give up cases include:
1927 * - Have to talk to pager.
1928 * - Page is busy, absent or in error.
1929 * - Pager has locked out desired access.
1930 * - Fault needs to be restarted.
1931 * - Have to push page into copy object.
1932 *
1933 * The code is an infinite loop that moves one level down
1934 * the shadow chain each time. cur_object and cur_offset
1935 * refer to the current object being examined. object and offset
1936 * are the original object from the map. The loop is at the
1937 * top level if and only if object and cur_object are the same.
1938 *
1939 * Invariants: Map lock is held throughout. Lock is held on
1940 * original object and cur_object (if different) when
1941 * continuing or exiting loop.
1942 *
1943 */
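/*
 *	A minimal, self-contained sketch of the decision structure
 *	described above (not compiled).  The toy_* types and the
 *	fast_fault_result values are illustrative assumptions only;
 *	locking, paging references, the actual copy-on-write push and
 *	the pmap work are all omitted.
 */
#if 0
struct toy_page {
	int	busy, error, absent;
};

struct toy_object {
	struct toy_object	*shadow;	/* next object down the chain */
	struct toy_object	*copy;		/* copy object, if any */
	long			shadow_offset;	/* bias into the shadow */
	int			pager_created;	/* backed by a pager? */
	struct toy_page		*(*lookup)(struct toy_object *, long);
};

enum fast_fault_result {
	FAST_MAPPED,		/* enter the page as found */
	FAST_MAPPED_READONLY,	/* enter it, but without write access */
	FAST_COW_COPY_UP,	/* copy the page up to the top object */
	FAST_ZERO_FILL,		/* allocate and zero a page at the top */
	FAST_SLOW_PATH		/* give up; take the vm_fault_page path */
};

static enum fast_fault_result
toy_fast_fault(struct toy_object *object, long offset, int is_write)
{
	struct toy_object	*cur_object = object;
	long			cur_offset = offset;

	for (;;) {
		struct toy_page *m = cur_object->lookup(cur_object, cur_offset);

		if (m != 0) {
			if (m->busy || m->error || m->absent)
				return FAST_SLOW_PATH;		/* unusual page */
			if (cur_object == object && object->copy == 0)
				return FAST_MAPPED;		/* top level, no copy */
			if (!is_write)
				return FAST_MAPPED_READONLY;	/* read fault anywhere */
			if (cur_object == object)
				return FAST_SLOW_PATH;		/* push into copy object */
			return FAST_COW_COPY_UP;		/* shadow-chain COW */
		}
		if (cur_object->pager_created)
			return FAST_SLOW_PATH;			/* must talk to the pager */
		if (cur_object->shadow == 0)
			return FAST_ZERO_FILL;			/* bottom of the chain */

		/* no page here: descend one level, biasing the offset */
		cur_offset += cur_object->shadow_offset;
		cur_object = cur_object->shadow;
	}
}
#endif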
1944
1945
1946 /*
1947 * If this page is to be inserted in a copy delay object
1948 * for writing, and if the object has a copy, then the
1949 * copy delay strategy is implemented in the slow fault page.
1950 */
1951 if (object->copy_strategy != MEMORY_OBJECT_COPY_DELAY ||
1952 object->copy == VM_OBJECT_NULL ||
1953 (fault_type & VM_PROT_WRITE) == 0) {
1954 cur_object = object;
1955 cur_offset = offset;
1956
1957 while (TRUE) {
1958 m = vm_page_lookup(cur_object, cur_offset);
1959 if (m != VM_PAGE_NULL) {
1960 if (m->busy) {
1961 wait_result_t result;
1962
1963 if (object != cur_object)
1964 vm_object_unlock(object);
1965
1966 vm_map_unlock_read(map);
1967 if (pmap_map != map)
1968 vm_map_unlock(pmap_map);
1969
1970 #if !VM_FAULT_STATIC_CONFIG
1971 if (!vm_fault_interruptible)
1972 interruptible = THREAD_UNINT;
1973 #endif
1974 result = PAGE_ASSERT_WAIT(m, interruptible);
1975
1976 vm_object_unlock(cur_object);
1977
1978 if (result == THREAD_WAITING) {
1979 result = thread_block(THREAD_CONTINUE_NULL);
1980
1981 counter(c_vm_fault_page_block_busy_kernel++);
1982 }
1983 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
1984 goto RetryFault;
1985
1986 kr = KERN_ABORTED;
1987 goto done;
1988 }
1989 if (m->unusual && (m->error || m->restart || m->private
1990 || m->absent || (fault_type & m->page_lock))) {
1991
1992 /*
1993 * Unusual case. Give up.
1994 */
1995 break;
1996 }
1997
1998 /*
1999 * Two cases of map in faults:
2000 * - At top level w/o copy object.
2001 * - Read fault anywhere.
2002 * --> must disallow write.
2003 */
2004
2005 if (object == cur_object &&
2006 object->copy == VM_OBJECT_NULL)
2007 goto FastMapInFault;
2008
2009 if ((fault_type & VM_PROT_WRITE) == 0) {
2010
2011 prot &= ~VM_PROT_WRITE;
2012
2013 /*
2014 * Set up to map the page ...
2015 * mark the page busy, drop
2016 * locks and take a paging reference
2017 * on the object with the page.
2018 */
2019
2020 if (object != cur_object) {
2021 vm_object_unlock(object);
2022 object = cur_object;
2023 }
2024 FastMapInFault:
2025 m->busy = TRUE;
2026
2027 vm_object_paging_begin(object);
2028
2029 FastPmapEnter:
2030 /*
2031 * Check a couple of global reasons to
2032 * be conservative about write access.
2033 * Then do the pmap_enter.
2034 */
2035 #if !VM_FAULT_STATIC_CONFIG
2036 if (vm_fault_dirty_handling
2037 #if MACH_KDB
2038 || db_watchpoint_list
2039 #endif
2040 && (fault_type & VM_PROT_WRITE) == 0)
2041 prot &= ~VM_PROT_WRITE;
2042 #else /* STATIC_CONFIG */
2043 #if MACH_KDB
2044 if (db_watchpoint_list
2045 && (fault_type & VM_PROT_WRITE) == 0)
2046 prot &= ~VM_PROT_WRITE;
2047 #endif /* MACH_KDB */
2048 #endif /* STATIC_CONFIG */
2049 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
2050 if ((m->no_isync == TRUE) ||
2051 (cache_attr != VM_WIMG_DEFAULT)) {
2052 pmap_sync_caches_phys(m->phys_page);
2053 m->no_isync = FALSE;
2054 }
2055
2056 if(caller_pmap) {
2057 PMAP_ENTER(caller_pmap,
2058 caller_pmap_addr, m,
2059 prot, cache_attr, wired);
2060 } else {
2061 PMAP_ENTER(pmap, vaddr, m,
2062 prot, cache_attr, wired);
2063 }
2064
2065 /*
2066 * Grab the queues lock to manipulate
2067 * the page queues. Change wiring
2068 * case is obvious. In soft ref bits
2069 * case activate page only if it fell
2070 * off paging queues, otherwise just
2071 * activate it if it's inactive.
2072 *
2073 * NOTE: original vm_fault code will
2074 * move active page to back of active
2075 * queue. This code doesn't.
2076 */
2077 vm_page_lock_queues();
2078
2079 if (m->clustered) {
2080 vm_pagein_cluster_used++;
2081 m->clustered = FALSE;
2082 }
2083 m->reference = TRUE;
2084
2085 if (change_wiring) {
2086 if (wired)
2087 vm_page_wire(m);
2088 else
2089 vm_page_unwire(m);
2090 }
2091 #if VM_FAULT_STATIC_CONFIG
2092 else {
2093 if (!m->active && !m->inactive)
2094 vm_page_activate(m);
2095 }
2096 #else
2097 else if (software_reference_bits) {
2098 if (!m->active && !m->inactive)
2099 vm_page_activate(m);
2100 }
2101 else if (!m->active) {
2102 vm_page_activate(m);
2103 }
2104 #endif
2105 vm_page_unlock_queues();
2106
2107 /*
2108 * That's it, clean up and return.
2109 */
2110 PAGE_WAKEUP_DONE(m);
2111 vm_object_paging_end(object);
2112
2113 {
2114 tws_hash_line_t line;
2115 task_t task;
2116
2117 task = current_task();
2118 if((map != NULL) &&
2119 (task->dynamic_working_set != 0) &&
2120 !(object->private)) {
2121 kern_return_t kr;
2122 vm_object_t base_object;
2123 vm_object_offset_t base_offset;
2124 base_object = object;
2125 base_offset = cur_offset;
2126 while(base_object->shadow) {
2127 base_offset +=
2128 base_object->shadow_offset;
2129 base_object =
2130 base_object->shadow;
2131 }
2132 kr = tws_lookup((tws_hash_t)
2133 task->dynamic_working_set,
2134 base_offset, base_object,
2135 &line);
2136 if(kr == KERN_OPERATION_TIMED_OUT){
2137 write_startup_file = 1;
2138 } else if (kr != KERN_SUCCESS) {
2139 kr = tws_insert((tws_hash_t)
2140 task->dynamic_working_set,
2141 base_offset, base_object,
2142 vaddr, pmap_map);
2143 if(kr == KERN_NO_SPACE) {
2144 vm_object_unlock(object);
2145
2146 tws_expand_working_set(
2147 task->dynamic_working_set,
2148 TWS_HASH_LINE_COUNT,
2149 FALSE);
2150
2151 vm_object_lock(object);
2152 }
2153 if(kr ==
2154 KERN_OPERATION_TIMED_OUT) {
2155 write_startup_file = 1;
2156 }
2157 }
2158 }
2159 }
2160 vm_object_unlock(object);
2161
2162 vm_map_unlock_read(map);
2163 if(pmap_map != map)
2164 vm_map_unlock(pmap_map);
2165
2166 if(write_startup_file)
2167 tws_send_startup_info(current_task());
2168
2169 if (funnel_set)
2170 thread_funnel_set( curflock, TRUE);
2171
2172 thread_interrupt_level(interruptible_state);
2173
2174
2175 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
2176 vaddr,
2177 type_of_fault & 0xff,
2178 KERN_SUCCESS,
2179 type_of_fault >> 8,
2180 0);
2181
2182 return KERN_SUCCESS;
2183 }
2184
2185 /*
2186 * Copy on write fault. If objects match, then
2187 * object->copy must not be NULL (else control
2188 * would be in previous code block), and we
2189 * have a potential push into the copy object
2190 * with which we won't cope here.
2191 */
2192
2193 if (cur_object == object)
2194 break;
2195 /*
2196 * This is now a shadow based copy on write
2197 * fault -- it requires a copy up the shadow
2198 * chain.
2199 *
2200 * Allocate a page in the original top level
2201 * object. Give up if allocate fails. Also
2202 * need to remember current page, as it's the
2203 * source of the copy.
2204 */
2205 cur_m = m;
2206 m = vm_page_grab();
2207 if (m == VM_PAGE_NULL) {
2208 break;
2209 }
2210 /*
2211 * Now do the copy. Mark the source busy
2212 * and take out paging references on both
2213 * objects.
2214 *
2215 * NOTE: This code holds the map lock across
2216 * the page copy.
2217 */
2218
2219 cur_m->busy = TRUE;
2220 vm_page_copy(cur_m, m);
2221 vm_page_insert(m, object, offset);
2222
2223 vm_object_paging_begin(cur_object);
2224 vm_object_paging_begin(object);
2225
2226 type_of_fault = DBG_COW_FAULT;
2227 VM_STAT(cow_faults++);
2228 current_task()->cow_faults++;
2229
2230 /*
2231 * Now cope with the source page and object
2232 * If the top object has a ref count of 1
2233 * then no other map can access it, and hence
2234 * it's not necessary to do the pmap_page_protect.
2235 */
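/*
 *	Note: as written, the code below always calls
 *	pmap_page_protect() on the source page; the ref_count == 1
 *	shortcut described above is not actually taken.
 */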
2236
2237
2238 vm_page_lock_queues();
2239 vm_page_deactivate(cur_m);
2240 m->dirty = TRUE;
2241 pmap_page_protect(cur_m->phys_page,
2242 VM_PROT_NONE);
2243 vm_page_unlock_queues();
2244
2245 PAGE_WAKEUP_DONE(cur_m);
2246 vm_object_paging_end(cur_object);
2247 vm_object_unlock(cur_object);
2248
2249 /*
2250 * Slight hack: call vm_object_collapse() here
2251 * and then reuse the common map-in code below.
2252 * Note that the object lock was taken above.
2253 */
2254
2255 vm_object_paging_end(object);
2256 vm_object_collapse(object);
2257 vm_object_paging_begin(object);
2258
2259 goto FastPmapEnter;
2260 }
2261 else {
2262
2263 /*
2264 * No page at cur_object, cur_offset
2265 */
2266
2267 if (cur_object->pager_created) {
2268
2269 /*
2270 * Have to talk to the pager. Give up.
2271 */
2272 break;
2273 }
2274
2275
2276 if (cur_object->shadow == VM_OBJECT_NULL) {
2277
2278 if (cur_object->shadow_severed) {
2279 vm_object_paging_end(object);
2280 vm_object_unlock(object);
2281 vm_map_unlock_read(map);
2282 if(pmap_map != map)
2283 vm_map_unlock(pmap_map);
2284
2285 if(write_startup_file)
2286 tws_send_startup_info(
2287 current_task());
2288
2289 if (funnel_set) {
2290 thread_funnel_set( curflock, TRUE);
2291 funnel_set = FALSE;
2292 }
2293 thread_interrupt_level(interruptible_state);
2294
2295 return KERN_MEMORY_ERROR;
2296 }
2297
2298 /*
2299 * Zero fill fault. Page gets
2300 * filled in top object. Insert
2301 * page, then drop any lower lock.
2302 * Give up if no page.
2303 */
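/*
 *	For example (hypothetical numbers): with vm_page_free_target
 *	at 2000 pages and vm_page_free_min at 1000, the threshold below
 *	is 2000 - ((2000 - 1000) >> 2) = 1750; if fewer than 1750 pages
 *	are free, the fast path declines to consume one for zero fill
 *	and falls back to the slow path, which can wait for memory.
 */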
2304 if ((vm_page_free_target -
2305 ((vm_page_free_target-vm_page_free_min)>>2))
2306 > vm_page_free_count) {
2307 break;
2308 }
2309 m = vm_page_alloc(object, offset);
2310 if (m == VM_PAGE_NULL) {
2311 break;
2312 }
2313 /*
2314 * This is a zero-fill or initial fill
2315 * page fault. As such, we consider it
2316 * undefined with respect to instruction
2317 * execution. i.e. it is the responsibility
2318 * of higher layers to call for an instruction
2319 * sync after changing the contents and before
2320 * sending a program into this area. We
2321 * choose this approach for performance
2322 */
2323 */
2324 m->no_isync = FALSE;
2325
2326 if (cur_object != object)
2327 vm_object_unlock(cur_object);
2328
2329 vm_object_paging_begin(object);
2330 vm_object_unlock(object);
2331
2332 /*
2333 * Now zero fill page and map it.
2334 * the page is probably going to
2335 * be written soon, so don't bother
2336 * to clear the modified bit
2337 *
2338 * NOTE: This code holds the map
2339 * lock across the zero fill.
2340 */
2341
2342 if (!map->no_zero_fill) {
2343 vm_page_zero_fill(m);
2344 type_of_fault = DBG_ZERO_FILL_FAULT;
2345 VM_STAT(zero_fill_count++);
2346 }
2347 vm_page_lock_queues();
2348 VM_PAGE_QUEUES_REMOVE(m);
2349
2350 m->page_ticket = vm_page_ticket;
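/*
 *	Pages zero-filled on behalf of objects larger than 0x80000
 *	bytes (512KB) go on the dedicated zero-fill queue; pages of
 *	smaller objects go directly onto the inactive queue.
 */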
2351 if(m->object->size > 0x80000) {
2352 m->zero_fill = TRUE;
2353 /* depends on the queues lock */
2354 vm_zf_count += 1;
2355 queue_enter(&vm_page_queue_zf,
2356 m, vm_page_t, pageq);
2357 } else {
2358 queue_enter(
2359 &vm_page_queue_inactive,
2360 m, vm_page_t, pageq);
2361 }
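/*
 *	vm_page_ticket_roll counts pages issued under the current
 *	ticket; after VM_PAGE_TICKETS_IN_ROLL pages the ticket value
 *	advances, wrapping at VM_PAGE_TICKET_ROLL_IDS, so zero-fill
 *	pages are effectively stamped in fixed-size batches.
 */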
2362 vm_page_ticket_roll++;
2363 if(vm_page_ticket_roll ==
2364 VM_PAGE_TICKETS_IN_ROLL) {
2365 vm_page_ticket_roll = 0;
2366 if(vm_page_ticket ==
2367 VM_PAGE_TICKET_ROLL_IDS)
2368 vm_page_ticket= 0;
2369 else
2370 vm_page_ticket++;
2371 }
2372
2373 m->inactive = TRUE;
2374 vm_page_inactive_count++;
2375 vm_page_unlock_queues();
2376 vm_object_lock(object);
2377
2378 goto FastPmapEnter;
2379 }
2380
2381 /*
2382 * On to the next level
2383 */
2384
2385 cur_offset += cur_object->shadow_offset;
2386 new_object = cur_object->shadow;
2387 vm_object_lock(new_object);
2388 if (cur_object != object)
2389 vm_object_unlock(cur_object);
2390 cur_object = new_object;
2391
2392 continue;
2393 }
2394 }
2395
2396 /*
2397 * Cleanup from fast fault failure. Drop any object
2398 * lock other than original and drop map lock.
2399 */
2400
2401 if (object != cur_object)
2402 vm_object_unlock(cur_object);
2403 }
2404 vm_map_unlock_read(map);
2405
2406 if(pmap_map != map)
2407 vm_map_unlock(pmap_map);
2408
2409 /*
2410 * Make a reference to this object to
2411 * prevent its disposal while we are messing with
2412 * it. Once we have the reference, the map is free
2413 * to be diddled. Since objects reference their
2414 * shadows (and copies), they will stay around as well.
2415 */
2416
2417 assert(object->ref_count > 0);
2418 object->ref_count++;
2419 vm_object_res_reference(object);
2420 vm_object_paging_begin(object);
2421
2422 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
2423 {
2424 tws_hash_line_t line;
2425 task_t task;
2426 kern_return_t kr;
2427
2428 task = current_task();
2429 if((map != NULL) &&
2430 (task->dynamic_working_set != 0)
2431 && !(object->private)) {
2432 vm_object_t base_object;
2433 vm_object_offset_t base_offset;
2434 base_object = object;
2435 base_offset = offset;
2436 while(base_object->shadow) {
2437 base_offset +=
2438 base_object->shadow_offset;
2439 base_object =
2440 base_object->shadow;
2441 }
2442 kr = tws_lookup((tws_hash_t)
2443 task->dynamic_working_set,
2444 base_offset, base_object,
2445 &line);
2446 if(kr == KERN_OPERATION_TIMED_OUT){
2447 write_startup_file = 1;
2448 } else if (kr != KERN_SUCCESS) {
2453 kr = tws_insert((tws_hash_t)
2454 task->dynamic_working_set,
2455 base_offset, base_object,
2456 vaddr, pmap_map);
2457 if(kr == KERN_NO_SPACE) {
2458 vm_object_unlock(object);
2459 tws_expand_working_set(
2460 task->dynamic_working_set,
2461 TWS_HASH_LINE_COUNT,
2462 FALSE);
2463 vm_object_lock(object);
2464 }
2465 if(kr == KERN_OPERATION_TIMED_OUT) {
2466 write_startup_file = 1;
2467 }
2468 }
2469 }
2470 }
2471 kr = vm_fault_page(object, offset, fault_type,
2472 (change_wiring && !wired),
2473 interruptible,
2474 lo_offset, hi_offset, behavior,
2475 &prot, &result_page, &top_page,
2476 &type_of_fault,
2477 &error_code, map->no_zero_fill, FALSE, map, vaddr);
2478
2479 /*
2480 * If we didn't succeed, lose the object reference immediately.
2481 */
2482
2483 if (kr != VM_FAULT_SUCCESS)
2484 vm_object_deallocate(object);
2485
2486 /*
2487 * See why we failed, and take corrective action.
2488 */
2489
2490 switch (kr) {
2491 case VM_FAULT_SUCCESS:
2492 break;
2493 case VM_FAULT_MEMORY_SHORTAGE:
2494 if (vm_page_wait((change_wiring) ?
2495 THREAD_UNINT :
2496 THREAD_ABORTSAFE))
2497 goto RetryFault;
2498 /* fall thru */
2499 case VM_FAULT_INTERRUPTED:
2500 kr = KERN_ABORTED;
2501 goto done;
2502 case VM_FAULT_RETRY:
2503 goto RetryFault;
2504 case VM_FAULT_FICTITIOUS_SHORTAGE:
2505 vm_page_more_fictitious();
2506 goto RetryFault;
2507 case VM_FAULT_MEMORY_ERROR:
2508 if (error_code)
2509 kr = error_code;
2510 else
2511 kr = KERN_MEMORY_ERROR;
2512 goto done;
2513 }
2514
2515 m = result_page;
2516
2517 if(m != VM_PAGE_NULL) {
2518 assert((change_wiring && !wired) ?
2519 (top_page == VM_PAGE_NULL) :
2520 ((top_page == VM_PAGE_NULL) == (m->object == object)));
2521 }
2522
2523 /*
2524 * How to clean up the result of vm_fault_page. This
2525 * happens whether the mapping is entered or not.
2526 */
2527
2528 #define UNLOCK_AND_DEALLOCATE \
2529 MACRO_BEGIN \
2530 vm_fault_cleanup(m->object, top_page); \
2531 vm_object_deallocate(object); \
2532 MACRO_END
2533
2534 /*
2535 * What to do with the resulting page from vm_fault_page
2536 * if it doesn't get entered into the physical map:
2537 */
2538
2539 #define RELEASE_PAGE(m) \
2540 MACRO_BEGIN \
2541 PAGE_WAKEUP_DONE(m); \
2542 vm_page_lock_queues(); \
2543 if (!m->active && !m->inactive) \
2544 vm_page_activate(m); \
2545 vm_page_unlock_queues(); \
2546 MACRO_END
2547
2548 /*
2549 * We must verify that the maps have not changed
2550 * since our last lookup.
2551 */
2552
2553 if(m != VM_PAGE_NULL) {
2554 old_copy_object = m->object->copy;
2555 vm_object_unlock(m->object);
2556 } else {
2557 old_copy_object = VM_OBJECT_NULL;
2558 }
2559 if ((map != original_map) || !vm_map_verify(map, &version)) {
2560 vm_object_t retry_object;
2561 vm_object_offset_t retry_offset;
2562 vm_prot_t retry_prot;
2563
2564 /*
2565 * To avoid trying to write_lock the map while another
2566 * thread has it read_locked (in vm_map_pageable), we
2567 * do not try for write permission. If the page is
2568 * still writable, we will get write permission. If it
2569 * is not, or has been marked needs_copy, we enter the
2570 * mapping without write permission, and will merely
2571 * take another fault.
2572 */
2573 map = original_map;
2574 vm_map_lock_read(map);
2575 kr = vm_map_lookup_locked(&map, vaddr,
2576 fault_type & ~VM_PROT_WRITE, &version,
2577 &retry_object, &retry_offset, &retry_prot,
2578 &wired, &behavior, &lo_offset, &hi_offset,
2579 &pmap_map);
2580 pmap = pmap_map->pmap;
2581
2582 if (kr != KERN_SUCCESS) {
2583 vm_map_unlock_read(map);
2584 if(m != VM_PAGE_NULL) {
2585 vm_object_lock(m->object);
2586 RELEASE_PAGE(m);
2587 UNLOCK_AND_DEALLOCATE;
2588 } else {
2589 vm_object_deallocate(object);
2590 }
2591 goto done;
2592 }
2593
2594 vm_object_unlock(retry_object);
2595 if(m != VM_PAGE_NULL) {
2596 vm_object_lock(m->object);
2597 } else {
2598 vm_object_lock(object);
2599 }
2600
2601 if ((retry_object != object) ||
2602 (retry_offset != offset)) {
2603 vm_map_unlock_read(map);
2604 if(pmap_map != map)
2605 vm_map_unlock(pmap_map);
2606 if(m != VM_PAGE_NULL) {
2607 RELEASE_PAGE(m);
2608 UNLOCK_AND_DEALLOCATE;
2609 } else {
2610 vm_object_deallocate(object);
2611 }
2612 goto RetryFault;
2613 }
2614
2615 /*
2616 * Check whether the protection has changed or the object
2617 * has been copied while we left the map unlocked.
2618 */
2619 prot &= retry_prot;
2620 if(m != VM_PAGE_NULL) {
2621 vm_object_unlock(m->object);
2622 } else {
2623 vm_object_unlock(object);
2624 }
2625 }
2626 if(m != VM_PAGE_NULL) {
2627 vm_object_lock(m->object);
2628 } else {
2629 vm_object_lock(object);
2630 }
2631
2632 /*
2633 * If the copy object changed while the top-level object
2634 * was unlocked, then we must take away write permission.
2635 */
2636
2637 if(m != VM_PAGE_NULL) {
2638 if (m->object->copy != old_copy_object)
2639 prot &= ~VM_PROT_WRITE;
2640 }
2641
2642 /*
2643 * If we want to wire down this page, but no longer have
2644 * adequate permissions, we must start all over.
2645 */
2646
2647 if (wired && (fault_type != (prot|VM_PROT_WRITE))) {
2648 vm_map_verify_done(map, &version);
2649 if(pmap_map != map)
2650 vm_map_unlock(pmap_map);
2651 if(m != VM_PAGE_NULL) {
2652 RELEASE_PAGE(m);
2653 UNLOCK_AND_DEALLOCATE;
2654 } else {
2655 vm_object_deallocate(object);
2656 }
2657 goto RetryFault;
2658 }
2659
2660 /*
2661 * Put this page into the physical map.
2662 * We had to do the unlock above because pmap_enter
2663 * may cause other faults. The page may be on
2664 * the pageout queues. If the pageout daemon comes
2665 * across the page, it will remove it from the queues.
2666 */
2667 if (m != VM_PAGE_NULL) {
2668 if (m->no_isync == TRUE) {
2669 pmap_sync_caches_phys(m->phys_page);
2670
2671 m->no_isync = FALSE;
2672 }
2673
2674 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
2675
2676 if(caller_pmap) {
2677 PMAP_ENTER(caller_pmap,
2678 caller_pmap_addr, m,
2679 prot, cache_attr, wired);
2680 } else {
2681 PMAP_ENTER(pmap, vaddr, m,
2682 prot, cache_attr, wired);
2683 }
2684 {
2685 tws_hash_line_t line;
2686 task_t task;
2687 kern_return_t kr;
2688
2689 task = current_task();
2690 if((map != NULL) &&
2691 (task->dynamic_working_set != 0)
2692 && (object->private)) {
2693 vm_object_t base_object;
2694 vm_object_offset_t base_offset;
2695 base_object = m->object;
2696 base_offset = m->offset;
2697 while(base_object->shadow) {
2698 base_offset +=
2699 base_object->shadow_offset;
2700 base_object =
2701 base_object->shadow;
2702 }
2703 kr = tws_lookup((tws_hash_t)
2704 task->dynamic_working_set,
2705 base_offset, base_object, &line);
2706 if(kr == KERN_OPERATION_TIMED_OUT){
2707 write_startup_file = 1;
2708 } else if (kr != KERN_SUCCESS) {
2713 kr = tws_insert((tws_hash_t)
2714 task->dynamic_working_set,
2715 base_offset, base_object,
2716 vaddr, pmap_map);
2717 if(kr == KERN_NO_SPACE) {
2718 vm_object_unlock(m->object);
2719 tws_expand_working_set(
2720 task->dynamic_working_set,
2721 TWS_HASH_LINE_COUNT,
2722 FALSE);
2723 vm_object_lock(m->object);
2724 }
2725 if(kr == KERN_OPERATION_TIMED_OUT) {
2726 write_startup_file = 1;
2727 }
2728 }
2729 }
2730 }
2731 } else {
2732
2733 #ifndef i386
2734 int memattr;
2735 vm_map_entry_t entry;
2736 vm_offset_t laddr;
2737 vm_offset_t ldelta, hdelta;
2738
2739 /*
2740 * do a pmap block mapping from the physical address
2741 * in the object
2742 */
2743
2744 /* While we do not worry about execution protection in */
2745 /* general, certain pages may have instruction execution */
2746 /* disallowed. We will check here, and if not allowed */
2747 /* to execute, we return with a protection failure. */
2748
2749 if((full_fault_type & VM_PROT_EXECUTE) &&
2750 (pmap_canExecute((ppnum_t)
2751 (object->shadow_offset >> 12)) < 1)) {
2752
2753 vm_map_verify_done(map, &version);
2754 if(pmap_map != map)
2755 vm_map_unlock(pmap_map);
2756 vm_fault_cleanup(object, top_page);
2757 vm_object_deallocate(object);
2758 kr = KERN_PROTECTION_FAILURE;
2759 goto done;
2760 }
2761
2762 if(pmap_map != map) {
2763 vm_map_unlock(pmap_map);
2764 }
2765 if (original_map != map) {
2766 vm_map_unlock_read(map);
2767 vm_map_lock_read(original_map);
2768 map = original_map;
2769 }
2770 pmap_map = map;
2771
2772 laddr = vaddr;
2773 hdelta = 0xFFFFF000;
2774 ldelta = 0xFFFFF000;
2775
2776
2777 while(vm_map_lookup_entry(map, laddr, &entry)) {
2778 if(ldelta > (laddr - entry->vme_start))
2779 ldelta = laddr - entry->vme_start;
2780 if(hdelta > (entry->vme_end - laddr))
2781 hdelta = entry->vme_end - laddr;
2782 if(entry->is_sub_map) {
2783
2784 laddr = (laddr - entry->vme_start)
2785 + entry->offset;
2786 vm_map_lock_read(entry->object.sub_map);
2787 if(map != pmap_map)
2788 vm_map_unlock_read(map);
2789 if(entry->use_pmap) {
2790 vm_map_unlock_read(pmap_map);
2791 pmap_map = entry->object.sub_map;
2792 }
2793 map = entry->object.sub_map;
2794
2795 } else {
2796 break;
2797 }
2798 }
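/*
 *	For example (hypothetical numbers, no submaps so laddr == vaddr):
 *	if the final entry covers [0x10000000, 0x10080000) and vaddr is
 *	0x10003000, the loop above leaves ldelta = 0x3000 and
 *	hdelta = 0x7d000 (both start out capped at 0xFFFFF000), so the
 *	block mapping below begins at vaddr - ldelta = 0x10000000 and
 *	spans ldelta + hdelta = 0x80000 bytes, i.e. the entry's whole range.
 */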
2799
2800 if(vm_map_lookup_entry(map, laddr, &entry) &&
2801 (entry->object.vm_object != NULL) &&
2802 (entry->object.vm_object == object)) {
2803
2804
2805 if(caller_pmap) {
2806 /* Set up a block mapped area */
2807 pmap_map_block(caller_pmap,
2808 (addr64_t)(caller_pmap_addr - ldelta),
2809 (((vm_offset_t)
2810 (entry->object.vm_object->shadow_offset))
2811 + entry->offset +
2812 (laddr - entry->vme_start)
2813 - ldelta)>>12,
2814 ldelta + hdelta, prot,
2815 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
2816 } else {
2817 /* Set up a block mapped area */
2818 pmap_map_block(pmap_map->pmap,
2819 (addr64_t)(vaddr - ldelta),
2820 (((vm_offset_t)
2821 (entry->object.vm_object->shadow_offset))
2822 + entry->offset +
2823 (laddr - entry->vme_start) - ldelta)>>12,
2824 ldelta + hdelta, prot,
2825 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
2826 }
2827 }
2828 #else
2829 #ifdef notyet
2830 if(caller_pmap) {
2831 pmap_enter(caller_pmap, caller_pmap_addr,
2832 object->shadow_offset>>12, prot, 0, TRUE);
2833 } else {
2834 pmap_enter(pmap, vaddr,
2835 object->shadow_offset>>12, prot, 0, TRUE);
2836 }
2837 /* Map it in */
2838 #endif
2839 #endif
2840
2841 }
2842
2843 /*
2844 * If the page is not wired down and isn't already
2845 * on a pageout queue, then put it where the
2846 * pageout daemon can find it.
2847 */
2848 if(m != VM_PAGE_NULL) {
2849 vm_page_lock_queues();
2850
2851 if (change_wiring) {
2852 if (wired)
2853 vm_page_wire(m);
2854 else
2855 vm_page_unwire(m);
2856 }
2857 #if VM_FAULT_STATIC_CONFIG
2858 else {
2859 if (!m->active && !m->inactive)
2860 vm_page_activate(m);
2861 m->reference = TRUE;
2862 }
2863 #else
2864 else if (software_reference_bits) {
2865 if (!m->active && !m->inactive)
2866 vm_page_activate(m);
2867 m->reference = TRUE;
2868 } else {
2869 vm_page_activate(m);
2870 }
2871 #endif
2872 vm_page_unlock_queues();
2873 }
2874
2875 /*
2876 * Unlock everything, and return
2877 */
2878
2879 vm_map_verify_done(map, &version);
2880 if(pmap_map != map)
2881 vm_map_unlock(pmap_map);
2882 if(m != VM_PAGE_NULL) {
2883 PAGE_WAKEUP_DONE(m);
2884 UNLOCK_AND_DEALLOCATE;
2885 } else {
2886 vm_fault_cleanup(object, top_page);
2887 vm_object_deallocate(object);
2888 }
2889 kr = KERN_SUCCESS;
2890
2891 #undef UNLOCK_AND_DEALLOCATE
2892 #undef RELEASE_PAGE
2893
2894 done:
2895 if(write_startup_file)
2896 tws_send_startup_info(current_task());
2897 if (funnel_set) {
2898 thread_funnel_set( curflock, TRUE);
2899 funnel_set = FALSE;
2900 }
2901 thread_interrupt_level(interruptible_state);
2902
2903 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
2904 vaddr,
2905 type_of_fault & 0xff,
2906 kr,
2907 type_of_fault >> 8,
2908 0);
2909
2910 return(kr);
2911 }
2912
2913 /*
2914 * vm_fault_wire:
2915 *
2916 * Wire down a range of virtual addresses in a map.
2917 */
2918 kern_return_t
2919 vm_fault_wire(
2920 vm_map_t map,
2921 vm_map_entry_t entry,
2922 pmap_t pmap,
2923 vm_offset_t pmap_addr)
2924 {
2925
2926 register vm_offset_t va;
2927 register vm_offset_t end_addr = entry->vme_end;
2928 register kern_return_t rc;
2929
2930 assert(entry->in_transition);
2931
2932 if ((entry->object.vm_object != NULL) &&
2933 !entry->is_sub_map &&
2934 entry->object.vm_object->phys_contiguous) {
2935 return KERN_SUCCESS;
2936 }
2937
2938 /*
2939 * Inform the physical mapping system that the
2940 * range of addresses may not fault, so that
2941 * page tables and such can be locked down as well.
2942 */
2943
2944 pmap_pageable(pmap, pmap_addr,
2945 pmap_addr + (end_addr - entry->vme_start), FALSE);
2946
2947 /*
2948 * We simulate a fault to get the page and enter it
2949 * in the physical map.
2950 */
2951
2952 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
2953 if ((rc = vm_fault_wire_fast(
2954 map, va, entry, pmap,
2955 pmap_addr + (va - entry->vme_start)
2956 )) != KERN_SUCCESS) {
2957 rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
2958 (pmap == kernel_pmap) ?
2959 THREAD_UNINT : THREAD_ABORTSAFE,
2960 pmap, pmap_addr + (va - entry->vme_start));
2961 }
2962
2963 if (rc != KERN_SUCCESS) {
2964 struct vm_map_entry tmp_entry = *entry;
2965
2966 /* unwire wired pages */
2967 tmp_entry.vme_end = va;
2968 vm_fault_unwire(map,
2969 &tmp_entry, FALSE, pmap, pmap_addr);
2970
2971 return rc;
2972 }
2973 }
2974 return KERN_SUCCESS;
2975 }
2976
2977 /*
2978 * vm_fault_unwire:
2979 *
2980 * Unwire a range of virtual addresses in a map.
2981 */
2982 void
2983 vm_fault_unwire(
2984 vm_map_t map,
2985 vm_map_entry_t entry,
2986 boolean_t deallocate,
2987 pmap_t pmap,
2988 vm_offset_t pmap_addr)
2989 {
2990 register vm_offset_t va;
2991 register vm_offset_t end_addr = entry->vme_end;
2992 vm_object_t object;
2993
2994 object = (entry->is_sub_map)
2995 ? VM_OBJECT_NULL : entry->object.vm_object;
2996
2997 /*
2998 * Since the pages are wired down, we must be able to
2999 * get their mappings from the physical map system.
3000 */
3001
3002 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3003 pmap_change_wiring(pmap,
3004 pmap_addr + (va - entry->vme_start), FALSE);
3005
3006 if (object == VM_OBJECT_NULL) {
3007 (void) vm_fault(map, va, VM_PROT_NONE,
3008 TRUE, THREAD_UNINT, pmap, pmap_addr);
3009 } else if (object->phys_contiguous) {
3010 continue;
3011 } else {
3012 vm_prot_t prot;
3013 vm_page_t result_page;
3014 vm_page_t top_page;
3015 vm_object_t result_object;
3016 vm_fault_return_t result;
3017
3018 do {
3019 prot = VM_PROT_NONE;
3020
3021 vm_object_lock(object);
3022 vm_object_paging_begin(object);
3023 XPR(XPR_VM_FAULT,
3024 "vm_fault_unwire -> vm_fault_page\n",
3025 0,0,0,0,0);
3026 result = vm_fault_page(object,
3027 entry->offset +
3028 (va - entry->vme_start),
3029 VM_PROT_NONE, TRUE,
3030 THREAD_UNINT,
3031 entry->offset,
3032 entry->offset +
3033 (entry->vme_end
3034 - entry->vme_start),
3035 entry->behavior,
3036 &prot,
3037 &result_page,
3038 &top_page,
3039 (int *)0,
3040 0, map->no_zero_fill,
3041 FALSE, NULL, 0);
3042 } while (result == VM_FAULT_RETRY);
3043
3044 if (result != VM_FAULT_SUCCESS)
3045 panic("vm_fault_unwire: failure");
3046
3047 result_object = result_page->object;
3048 if (deallocate) {
3049 assert(!result_page->fictitious);
3050 pmap_page_protect(result_page->phys_page,
3051 VM_PROT_NONE);
3052 VM_PAGE_FREE(result_page);
3053 } else {
3054 vm_page_lock_queues();
3055 vm_page_unwire(result_page);
3056 vm_page_unlock_queues();
3057 PAGE_WAKEUP_DONE(result_page);
3058 }
3059
3060 vm_fault_cleanup(result_object, top_page);
3061 }
3062 }
3063
3064 /*
3065 * Inform the physical mapping system that the range
3066 * of addresses may fault, so that page tables and
3067 * such may be unwired themselves.
3068 */
3069
3070 pmap_pageable(pmap, pmap_addr,
3071 pmap_addr + (end_addr - entry->vme_start), TRUE);
3072
3073 }
3074
3075 /*
3076 * vm_fault_wire_fast:
3077 *
3078 * Handle common case of a wire down page fault at the given address.
3079 * If successful, the page is inserted into the associated physical map.
3080 * The map entry is passed in to avoid the overhead of a map lookup.
3081 *
3082 * NOTE: the given address should be truncated to the
3083 * proper page address.
3084 *
3085 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
3086 * a standard error specifying why the fault is fatal is returned.
3087 *
3088 * The map in question must be referenced, and remains so.
3089 * Caller has a read lock on the map.
3090 *
3091 * This is a stripped version of vm_fault() for wiring pages. Anything
3092 * other than the common case will return KERN_FAILURE, and the caller
3093 * is expected to call vm_fault().
3094 */
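/*
 *	For the intended usage pattern, see vm_fault_wire() above: it
 *	tries vm_fault_wire_fast() for each page first and falls back
 *	to the full vm_fault() path whenever this routine fails.
 */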
3095 kern_return_t
3096 vm_fault_wire_fast(
3097 vm_map_t map,
3098 vm_offset_t va,
3099 vm_map_entry_t entry,
3100 pmap_t pmap,
3101 vm_offset_t pmap_addr)
3102 {
3103 vm_object_t object;
3104 vm_object_offset_t offset;
3105 register vm_page_t m;
3106 vm_prot_t prot;
3107 thread_act_t thr_act;
3108 unsigned int cache_attr;
3109
3110 VM_STAT(faults++);
3111
3112 if((thr_act=current_act()) && (thr_act->task != TASK_NULL))
3113 thr_act->task->faults++;
3114
3115 /*
3116 * Recovery actions
3117 */
3118
3119 #undef RELEASE_PAGE
3120 #define RELEASE_PAGE(m) { \
3121 PAGE_WAKEUP_DONE(m); \
3122 vm_page_lock_queues(); \
3123 vm_page_unwire(m); \
3124 vm_page_unlock_queues(); \
3125 }
3126
3127
3128 #undef UNLOCK_THINGS
3129 #define UNLOCK_THINGS { \
3130 object->paging_in_progress--; \
3131 vm_object_unlock(object); \
3132 }
3133
3134 #undef UNLOCK_AND_DEALLOCATE
3135 #define UNLOCK_AND_DEALLOCATE { \
3136 UNLOCK_THINGS; \
3137 vm_object_deallocate(object); \
3138 }
3139 /*
3140 * Give up and have caller do things the hard way.
3141 */
3142
3143 #define GIVE_UP { \
3144 UNLOCK_AND_DEALLOCATE; \
3145 return(KERN_FAILURE); \
3146 }
3147
3148
3149 /*
3150 * If this entry is not directly to a vm_object, bail out.
3151 */
3152 if (entry->is_sub_map)
3153 return(KERN_FAILURE);
3154
3155 /*
3156 * Find the backing store object and offset into it.
3157 */
3158
3159 object = entry->object.vm_object;
3160 offset = (va - entry->vme_start) + entry->offset;
3161 prot = entry->protection;
3162
3163 /*
3164 * Make a reference to this object to prevent its
3165 * disposal while we are messing with it.
3166 */
3167
3168 vm_object_lock(object);
3169 assert(object->ref_count > 0);
3170 object->ref_count++;
3171 vm_object_res_reference(object);
3172 object->paging_in_progress++;
3173
3174 /*
3175 * INVARIANTS (through entire routine):
3176 *
3177 * 1) At all times, we must either have the object
3178 * lock or a busy page in some object to prevent
3179 * some other thread from trying to bring in
3180 * the same page.
3181 *
3182 * 2) Once we have a busy page, we must remove it from
3183 * the pageout queues, so that the pageout daemon
3184 * will not grab it away.
3185 *
3186 */
3187
3188 /*
3189 * Look for page in top-level object. If it's not there or
3190 * there's something going on, give up.
3191 */
3192 m = vm_page_lookup(object, offset);
3193 if ((m == VM_PAGE_NULL) || (m->busy) ||
3194 (m->unusual && ( m->error || m->restart || m->absent ||
3195 prot & m->page_lock))) {
3196
3197 GIVE_UP;
3198 }
3199
3200 /*
3201 * Wire the page down now. All bail outs beyond this
3202 * point must unwire the page.
3203 */
3204
3205 vm_page_lock_queues();
3206 vm_page_wire(m);
3207 vm_page_unlock_queues();
3208
3209 /*
3210 * Mark page busy for other threads.
3211 */
3212 assert(!m->busy);
3213 m->busy = TRUE;
3214 assert(!m->absent);
3215
3216 /*
3217 * Give up if the page is being written and there's a copy object
3218 */
3219 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
3220 RELEASE_PAGE(m);
3221 GIVE_UP;
3222 }
3223
3224 /*
3225 * Put this page into the physical map.
3226 * We have to unlock the object because pmap_enter
3227 * may cause other faults.
3228 */
3229 if (m->no_isync == TRUE) {
3230 pmap_sync_caches_phys(m->phys_page);
3231
3232 m->no_isync = FALSE;
3233 }
3234
3235 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3236
3237 PMAP_ENTER(pmap, pmap_addr, m, prot, cache_attr, TRUE);
3238
3239 /*
3240 * Unlock everything, and return
3241 */
3242
3243 PAGE_WAKEUP_DONE(m);
3244 UNLOCK_AND_DEALLOCATE;
3245
3246 return(KERN_SUCCESS);
3247
3248 }
3249
3250 /*
3251 * Routine: vm_fault_copy_cleanup
3252 * Purpose:
3253 * Release a page used by vm_fault_copy.
3254 */
3255
3256 void
3257 vm_fault_copy_cleanup(
3258 vm_page_t page,
3259 vm_page_t top_page)
3260 {
3261 vm_object_t object = page->object;
3262
3263 vm_object_lock(object);
3264 PAGE_WAKEUP_DONE(page);
3265 vm_page_lock_queues();
3266 if (!page->active && !page->inactive)
3267 vm_page_activate(page);
3268 vm_page_unlock_queues();
3269 vm_fault_cleanup(object, top_page);
3270 }
3271
3272 void
3273 vm_fault_copy_dst_cleanup(
3274 vm_page_t page)
3275 {
3276 vm_object_t object;
3277
3278 if (page != VM_PAGE_NULL) {
3279 object = page->object;
3280 vm_object_lock(object);
3281 vm_page_lock_queues();
3282 vm_page_unwire(page);
3283 vm_page_unlock_queues();
3284 vm_object_paging_end(object);
3285 vm_object_unlock(object);
3286 }
3287 }
3288
3289 /*
3290 * Routine: vm_fault_copy
3291 *
3292 * Purpose:
3293 * Copy pages from one virtual memory object to another --
3294 * neither the source nor destination pages need be resident.
3295 *
3296 * Before actually copying a page, the version associated with
3297 * the destination address map will be verified.
3298 *
3299 * In/out conditions:
3300 * The caller must hold a reference, but not a lock, to
3301 * each of the source and destination objects and to the
3302 * destination map.
3303 *
3304 * Results:
3305 * Returns KERN_SUCCESS if no errors were encountered in
3306 * reading or writing the data. Returns KERN_INTERRUPTED if
3307 * the operation was interrupted (only possible if the
3308 * "interruptible" argument is asserted). Other return values
3309 * indicate a permanent error in copying the data.
3310 *
3311 * The actual amount of data copied will be returned in the
3312 * "copy_size" argument. In the event that the destination map
3313 * verification failed, this amount may be less than the amount
3314 * requested.
3315 */
3316 kern_return_t
3317 vm_fault_copy(
3318 vm_object_t src_object,
3319 vm_object_offset_t src_offset,
3320 vm_size_t *src_size, /* INOUT */
3321 vm_object_t dst_object,
3322 vm_object_offset_t dst_offset,
3323 vm_map_t dst_map,
3324 vm_map_version_t *dst_version,
3325 int interruptible)
3326 {
3327 vm_page_t result_page;
3328
3329 vm_page_t src_page;
3330 vm_page_t src_top_page;
3331 vm_prot_t src_prot;
3332
3333 vm_page_t dst_page;
3334 vm_page_t dst_top_page;
3335 vm_prot_t dst_prot;
3336
3337 vm_size_t amount_left;
3338 vm_object_t old_copy_object;
3339 kern_return_t error = 0;
3340
3341 vm_size_t part_size;
3342
3343 /*
3344 * In order not to confuse the clustered pageins, align
3345 * the different offsets on a page boundary.
3346 */
3347 vm_object_offset_t src_lo_offset = trunc_page_64(src_offset);
3348 vm_object_offset_t dst_lo_offset = trunc_page_64(dst_offset);
3349 vm_object_offset_t src_hi_offset = round_page_64(src_offset + *src_size);
3350 vm_object_offset_t dst_hi_offset = round_page_64(dst_offset + *src_size);
3351
3352 #define RETURN(x) \
3353 MACRO_BEGIN \
3354 *src_size -= amount_left; \
3355 MACRO_RETURN(x); \
3356 MACRO_END
3357
3358 amount_left = *src_size;
3359 do { /* while (amount_left > 0) */
3360 /*
3361 * There may be a deadlock if both source and destination
3362 * pages are the same. To avoid this deadlock, the copy must
3363 * start by getting the destination page in order to apply
3364 * COW semantics if any.
3365 */
3366
3367 RetryDestinationFault: ;
3368
3369 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
3370
3371 vm_object_lock(dst_object);
3372 vm_object_paging_begin(dst_object);
3373
3374 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
3375 switch (vm_fault_page(dst_object,
3376 trunc_page_64(dst_offset),
3377 VM_PROT_WRITE|VM_PROT_READ,
3378 FALSE,
3379 interruptible,
3380 dst_lo_offset,
3381 dst_hi_offset,
3382 VM_BEHAVIOR_SEQUENTIAL,
3383 &dst_prot,
3384 &dst_page,
3385 &dst_top_page,
3386 (int *)0,
3387 &error,
3388 dst_map->no_zero_fill,
3389 FALSE, NULL, 0)) {
3390 case VM_FAULT_SUCCESS:
3391 break;
3392 case VM_FAULT_RETRY:
3393 goto RetryDestinationFault;
3394 case VM_FAULT_MEMORY_SHORTAGE:
3395 if (vm_page_wait(interruptible))
3396 goto RetryDestinationFault;
3397 /* fall thru */
3398 case VM_FAULT_INTERRUPTED:
3399 RETURN(MACH_SEND_INTERRUPTED);
3400 case VM_FAULT_FICTITIOUS_SHORTAGE:
3401 vm_page_more_fictitious();
3402 goto RetryDestinationFault;
3403 case VM_FAULT_MEMORY_ERROR:
3404 if (error)
3405 return (error);
3406 else
3407 return(KERN_MEMORY_ERROR);
3408 }
3409 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
3410
3411 old_copy_object = dst_page->object->copy;
3412
3413 /*
3414 * There exists the possibility that the source and
3415 * destination page are the same. But we can't
3416 * easily determine that now. If they are the
3417 * same, the call to vm_fault_page() for the
3418 * destination page will deadlock. To prevent this we
3419 * wire the page so we can drop busy without having
3420 * the page daemon steal the page. We clean up the
3421 * top page but keep the paging reference on the object
3422 * holding the dest page so it doesn't go away.
3423 */
3424
3425 vm_page_lock_queues();
3426 vm_page_wire(dst_page);
3427 vm_page_unlock_queues();
3428 PAGE_WAKEUP_DONE(dst_page);
3429 vm_object_unlock(dst_page->object);
3430
3431 if (dst_top_page != VM_PAGE_NULL) {
3432 vm_object_lock(dst_object);
3433 VM_PAGE_FREE(dst_top_page);
3434 vm_object_paging_end(dst_object);
3435 vm_object_unlock(dst_object);
3436 }
3437
3438 RetrySourceFault: ;
3439
3440 if (src_object == VM_OBJECT_NULL) {
3441 /*
3442 * No source object. We will just
3443 * zero-fill the page in dst_object.
3444 */
3445 src_page = VM_PAGE_NULL;
3446 result_page = VM_PAGE_NULL;
3447 } else {
3448 vm_object_lock(src_object);
3449 src_page = vm_page_lookup(src_object,
3450 trunc_page_64(src_offset));
3451 if (src_page == dst_page) {
3452 src_prot = dst_prot;
3453 result_page = VM_PAGE_NULL;
3454 } else {
3455 src_prot = VM_PROT_READ;
3456 vm_object_paging_begin(src_object);
3457
3458 XPR(XPR_VM_FAULT,
3459 "vm_fault_copy(2) -> vm_fault_page\n",
3460 0,0,0,0,0);
3461 switch (vm_fault_page(src_object,
3462 trunc_page_64(src_offset),
3463 VM_PROT_READ,
3464 FALSE,
3465 interruptible,
3466 src_lo_offset,
3467 src_hi_offset,
3468 VM_BEHAVIOR_SEQUENTIAL,
3469 &src_prot,
3470 &result_page,
3471 &src_top_page,
3472 (int *)0,
3473 &error,
3474 FALSE,
3475 FALSE, NULL, 0)) {
3476
3477 case VM_FAULT_SUCCESS:
3478 break;
3479 case VM_FAULT_RETRY:
3480 goto RetrySourceFault;
3481 case VM_FAULT_MEMORY_SHORTAGE:
3482 if (vm_page_wait(interruptible))
3483 goto RetrySourceFault;
3484 /* fall thru */
3485 case VM_FAULT_INTERRUPTED:
3486 vm_fault_copy_dst_cleanup(dst_page);
3487 RETURN(MACH_SEND_INTERRUPTED);
3488 case VM_FAULT_FICTITIOUS_SHORTAGE:
3489 vm_page_more_fictitious();
3490 goto RetrySourceFault;
3491 case VM_FAULT_MEMORY_ERROR:
3492 vm_fault_copy_dst_cleanup(dst_page);
3493 if (error)
3494 return (error);
3495 else
3496 return(KERN_MEMORY_ERROR);
3497 }
3498
3499
3500 assert((src_top_page == VM_PAGE_NULL) ==
3501 (result_page->object == src_object));
3502 }
3503 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
3504 vm_object_unlock(result_page->object);
3505 }
3506
3507 if (!vm_map_verify(dst_map, dst_version)) {
3508 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3509 vm_fault_copy_cleanup(result_page, src_top_page);
3510 vm_fault_copy_dst_cleanup(dst_page);
3511 break;
3512 }
3513
3514 vm_object_lock(dst_page->object);
3515
3516 if (dst_page->object->copy != old_copy_object) {
3517 vm_object_unlock(dst_page->object);
3518 vm_map_verify_done(dst_map, dst_version);
3519 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3520 vm_fault_copy_cleanup(result_page, src_top_page);
3521 vm_fault_copy_dst_cleanup(dst_page);
3522 break;
3523 }
3524 vm_object_unlock(dst_page->object);
3525
3526 /*
3527 * Copy the page, and note that it is dirty
3528 * immediately.
3529 */
3530
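#if 0
	/*
	 *	Illustration (not compiled) of the sizing logic below,
	 *	with hypothetical offsets and PAGE_SIZE assumed to be 4096.
	 */
	{
		vm_object_offset_t	ex_src_offset = 0x10100; /* src_po 0x100 */
		vm_object_offset_t	ex_dst_offset = 0x20300; /* dst_po 0x300 */
		vm_size_t		ex_amount_left = 0x5000;
		vm_object_offset_t	ex_src_po, ex_dst_po;
		vm_size_t		ex_part_size;

		ex_src_po = ex_src_offset & 0xFFF;
		ex_dst_po = ex_dst_offset & 0xFFF;

		/* dst_po > src_po, so the destination page limits the chunk */
		ex_part_size = 4096 - ((ex_dst_po > ex_src_po) ?
					ex_dst_po : ex_src_po);
		if (ex_part_size > ex_amount_left)
			ex_part_size = ex_amount_left;
		/* ex_part_size == 4096 - 0x300 == 0xd00 bytes this pass */
	}
#endif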
3531 if (!page_aligned(src_offset) ||
3532 !page_aligned(dst_offset) ||
3533 !page_aligned(amount_left)) {
3534
3535 vm_object_offset_t src_po,
3536 dst_po;
3537
3538 src_po = src_offset - trunc_page_64(src_offset);
3539 dst_po = dst_offset - trunc_page_64(dst_offset);
3540
3541 if (dst_po > src_po) {
3542 part_size = PAGE_SIZE - dst_po;
3543 } else {
3544 part_size = PAGE_SIZE - src_po;
3545 }
3546 if (part_size > (amount_left)){
3547 part_size = amount_left;
3548 }
3549
3550 if (result_page == VM_PAGE_NULL) {
3551 vm_page_part_zero_fill(dst_page,
3552 dst_po, part_size);
3553 } else {
3554 vm_page_part_copy(result_page, src_po,
3555 dst_page, dst_po, part_size);
3556 if(!dst_page->dirty){
3557 vm_object_lock(dst_object);
3558 dst_page->dirty = TRUE;
3559 vm_object_unlock(dst_page->object);
3560 }
3561
3562 }
3563 } else {
3564 part_size = PAGE_SIZE;
3565
3566 if (result_page == VM_PAGE_NULL)
3567 vm_page_zero_fill(dst_page);
3568 else{
3569 vm_page_copy(result_page, dst_page);
3570 if(!dst_page->dirty){
3571 vm_object_lock(dst_object);
3572 dst_page->dirty = TRUE;
3573 vm_object_unlock(dst_page->object);
3574 }
3575 }
3576
3577 }
3578
3579 /*
3580 * Unlock everything, and return
3581 */
3582
3583 vm_map_verify_done(dst_map, dst_version);
3584
3585 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3586 vm_fault_copy_cleanup(result_page, src_top_page);
3587 vm_fault_copy_dst_cleanup(dst_page);
3588
3589 amount_left -= part_size;
3590 src_offset += part_size;
3591 dst_offset += part_size;
3592 } while (amount_left > 0);
3593
3594 RETURN(KERN_SUCCESS);
3595 #undef RETURN
3596
3597 /*NOTREACHED*/
3598 }
3599
3600 #ifdef notdef
3601
3602 /*
3603 * Routine: vm_fault_page_overwrite
3604 *
3605 * Description:
3606 * A form of vm_fault_page that assumes that the
3607 * resulting page will be overwritten in its entirety,
3608 * making it unnecessary to obtain the correct *contents*
3609 * of the page.
3610 *
3611 * Implementation:
3612 * XXX Untested. Also unused. Eventually, this technology
3613 * could be used in vm_fault_copy() to advantage.
3614 */
3615 vm_fault_return_t
3616 vm_fault_page_overwrite(
3617 register
3618 vm_object_t dst_object,
3619 vm_object_offset_t dst_offset,
3620 vm_page_t *result_page) /* OUT */
3621 {
3622 register
3623 vm_page_t dst_page;
3624 kern_return_t wait_result;
3625
3626 #define interruptible THREAD_UNINT /* XXX */
3627
3628 while (TRUE) {
3629 /*
3630 * Look for a page at this offset
3631 */
3632
3633 while ((dst_page = vm_page_lookup(dst_object, dst_offset))
3634 == VM_PAGE_NULL) {
3635 /*
3636 * No page, no problem... just allocate one.
3637 */
3638
3639 dst_page = vm_page_alloc(dst_object, dst_offset);
3640 if (dst_page == VM_PAGE_NULL) {
3641 vm_object_unlock(dst_object);
3642 VM_PAGE_WAIT();
3643 vm_object_lock(dst_object);
3644 continue;
3645 }
3646
3647 /*
3648 * Pretend that the memory manager
3649 * write-protected the page.
3650 *
3651 * Note that we will be asking for write
3652 * permission without asking for the data
3653 * first.
3654 */
3655
3656 dst_page->overwriting = TRUE;
3657 dst_page->page_lock = VM_PROT_WRITE;
3658 dst_page->absent = TRUE;
3659 dst_page->unusual = TRUE;
3660 dst_object->absent_count++;
3661
3662 break;
3663
3664 /*
3665 * When we bail out, we might have to throw
3666 * away the page created here.
3667 */
3668
3669 #define DISCARD_PAGE \
3670 MACRO_BEGIN \
3671 vm_object_lock(dst_object); \
3672 dst_page = vm_page_lookup(dst_object, dst_offset); \
3673 if ((dst_page != VM_PAGE_NULL) && dst_page->overwriting) \
3674 VM_PAGE_FREE(dst_page); \
3675 vm_object_unlock(dst_object); \
3676 MACRO_END
3677 }
3678
3679 /*
3680 * If the page is write-protected...
3681 */
3682
3683 if (dst_page->page_lock & VM_PROT_WRITE) {
3684 /*
3685 * ... and an unlock request hasn't been sent
3686 */
3687
3688 if ( ! (dst_page->unlock_request & VM_PROT_WRITE)) {
3689 vm_prot_t u;
3690 kern_return_t rc;
3691
3692 /*
3693 * ... then send one now.
3694 */
3695
3696 if (!dst_object->pager_ready) {
3697 wait_result = vm_object_assert_wait(dst_object,
3698 VM_OBJECT_EVENT_PAGER_READY,
3699 interruptible);
3700 vm_object_unlock(dst_object);
3701 if (wait_result == THREAD_WAITING)
3702 wait_result = thread_block(THREAD_CONTINUE_NULL);
3703 if (wait_result != THREAD_AWAKENED) {
3704 DISCARD_PAGE;
3705 return(VM_FAULT_INTERRUPTED);
3706 }
3707 continue;
3708 }
3709
3710 u = dst_page->unlock_request |= VM_PROT_WRITE;
3711 vm_object_unlock(dst_object);
3712
3713 if ((rc = memory_object_data_unlock(
3714 dst_object->pager,
3715 dst_offset + dst_object->paging_offset,
3716 PAGE_SIZE,
3717 u)) != KERN_SUCCESS) {
3718 if (vm_fault_debug)
3719 printf("vm_object_overwrite: memory_object_data_unlock failed\n");
3720 DISCARD_PAGE;
3721 return((rc == MACH_SEND_INTERRUPTED) ?
3722 VM_FAULT_INTERRUPTED :
3723 VM_FAULT_MEMORY_ERROR);
3724 }
3725 vm_object_lock(dst_object);
3726 continue;
3727 }
3728
3729 /* ... fall through to wait below */
3730 } else {
3731 /*
3732 * If the page isn't being used for other
3733 * purposes, then we're done.
3734 */
3735 if ( ! (dst_page->busy || dst_page->absent ||
3736 dst_page->error || dst_page->restart) )
3737 break;
3738 }
3739
3740 wait_result = PAGE_ASSERT_WAIT(dst_page, interruptible);
3741 vm_object_unlock(dst_object);
3742 if (wait_result == THREAD_WAITING)
3743 wait_result = thread_block(THREAD_CONTINUE_NULL);
3744 if (wait_result != THREAD_AWAKENED) {
3745 DISCARD_PAGE;
3746 return(VM_FAULT_INTERRUPTED);
3747 }
3748 }
3749
3750 *result_page = dst_page;
3751 return(VM_FAULT_SUCCESS);
3752
3753 #undef interruptible
3754 #undef DISCARD_PAGE
3755 }
3756
3757 #endif /* notdef */
3758
3759 #if VM_FAULT_CLASSIFY
3760 /*
3761 * Temporary statistics gathering support.
3762 */
3763
3764 /*
3765 * Statistics arrays:
3766 */
3767 #define VM_FAULT_TYPES_MAX 5
3768 #define VM_FAULT_LEVEL_MAX 8
3769
3770 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
3771
3772 #define VM_FAULT_TYPE_ZERO_FILL 0
3773 #define VM_FAULT_TYPE_MAP_IN 1
3774 #define VM_FAULT_TYPE_PAGER 2
3775 #define VM_FAULT_TYPE_COPY 3
3776 #define VM_FAULT_TYPE_OTHER 4
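/*
 *	vm_fault_stats[type][level] counts faults of the given type that
 *	were classified "level" objects down the shadow chain (level 0
 *	being the mapped object itself).  A minimal sketch of dumping
 *	the table (not compiled; the formatting is only an assumption):
 */
#if 0
void
vm_fault_classify_dump(void)
{
	int	type, level;

	for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
		for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
			if (vm_fault_stats[type][level] != 0)
				printf("fault type %d, level %d: %d\n",
				       type, level,
				       vm_fault_stats[type][level]);
		}
	}
}
#endif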
3777
3778
3779 void
3780 vm_fault_classify(vm_object_t object,
3781 vm_object_offset_t offset,
3782 vm_prot_t fault_type)
3783 {
3784 int type, level = 0;
3785 vm_page_t m;
3786
3787 while (TRUE) {
3788 m = vm_page_lookup(object, offset);
3789 if (m != VM_PAGE_NULL) {
3790 if (m->busy || m->error || m->restart || m->absent ||
3791 fault_type & m->page_lock) {
3792 type = VM_FAULT_TYPE_OTHER;
3793 break;
3794 }
3795 if (((fault_type & VM_PROT_WRITE) == 0) ||
3796 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
3797 type = VM_FAULT_TYPE_MAP_IN;
3798 break;
3799 }
3800 type = VM_FAULT_TYPE_COPY;
3801 break;
3802 }
3803 else {
3804 if (object->pager_created) {
3805 type = VM_FAULT_TYPE_PAGER;
3806 break;
3807 }
3808 if (object->shadow == VM_OBJECT_NULL) {
3809 type = VM_FAULT_TYPE_ZERO_FILL;
3810 break;
3811 }
3812
3813 offset += object->shadow_offset;
3814 object = object->shadow;
3815 level++;
3816 continue;
3817 }
3818 }
3819
3820 if (level >= VM_FAULT_LEVEL_MAX)
3821 level = VM_FAULT_LEVEL_MAX - 1;
3822
3823 vm_fault_stats[type][level] += 1;
3824
3825 return;
3826 }
3827
3828 /* cleanup routine to call from debugger */
3829
3830 void
3831 vm_fault_classify_init(void)
3832 {
3833 int type, level;
3834
3835 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
3836 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
3837 vm_fault_stats[type][level] = 0;
3838 }
3839 }
3840
3841 return;
3842 }
3843 #endif /* VM_FAULT_CLASSIFY */