1
2 /*
3 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
4 *
5 * @APPLE_LICENSE_HEADER_START@
6 *
7 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
8 *
9 * This file contains Original Code and/or Modifications of Original Code
10 * as defined in and that are subject to the Apple Public Source License
11 * Version 2.0 (the 'License'). You may not use this file except in
12 * compliance with the License. Please obtain a copy of the License at
13 * http://www.opensource.apple.com/apsl/ and read it before using this
14 * file.
15 *
16 * The Original Code and all software distributed under the License are
17 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
18 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
19 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
21 * Please see the License for the specific language governing rights and
22 * limitations under the License.
23 *
24 * @APPLE_LICENSE_HEADER_END@
25 */
26 /*
27 * @OSF_COPYRIGHT@
28 */
29 /*
30 * Mach Operating System
31 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
32 * All Rights Reserved.
33 *
34 * Permission to use, copy, modify and distribute this software and its
35 * documentation is hereby granted, provided that both the copyright
36 * notice and this permission notice appear in all copies of the
37 * software, derivative works or modified versions, and any portions
38 * thereof, and that both notices appear in supporting documentation.
39 *
40 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
41 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
42 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
43 *
44 * Carnegie Mellon requests users of this software to return to
45 *
46 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
47 * School of Computer Science
48 * Carnegie Mellon University
49 * Pittsburgh PA 15213-3890
50 *
51 * any improvements or extensions that they make and grant Carnegie Mellon
52 * the rights to redistribute these changes.
53 */
54 /*
55 */
56 /*
57 * File: vm_fault.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 *
60 * Page fault handling module.
61 */
62 #ifdef MACH_BSD
63 /* remove after component interface available */
64 extern int vnode_pager_workaround;
65 extern int device_pager_workaround;
66 #endif
67
68 #include <mach_cluster_stats.h>
69 #include <mach_pagemap.h>
70 #include <mach_kdb.h>
71
72 #include <vm/vm_fault.h>
73 #include <mach/kern_return.h>
74 #include <mach/message.h> /* for error codes */
75 #include <kern/host_statistics.h>
76 #include <kern/counters.h>
77 #include <kern/task.h>
78 #include <kern/thread.h>
79 #include <kern/sched_prim.h>
80 #include <kern/host.h>
81 #include <kern/xpr.h>
82 #include <ppc/proc_reg.h>
83 #include <ppc/pmap_internals.h>
84 #include <vm/task_working_set.h>
85 #include <vm/vm_map.h>
86 #include <vm/vm_object.h>
87 #include <vm/vm_page.h>
88 #include <vm/pmap.h>
89 #include <vm/vm_pageout.h>
90 #include <mach/vm_param.h>
91 #include <mach/vm_behavior.h>
92 #include <mach/memory_object.h>
93 /* For memory_object_data_{request,unlock} */
94 #include <kern/mach_param.h>
95 #include <kern/macro_help.h>
96 #include <kern/zalloc.h>
97 #include <kern/misc_protos.h>
98
99 #include <sys/kdebug.h>
100
101 #define VM_FAULT_CLASSIFY 0
102 #define VM_FAULT_STATIC_CONFIG 1
103
104 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
105
106 int vm_object_absent_max = 50;
107
108 int vm_fault_debug = 0;
109 boolean_t vm_page_deactivate_behind = TRUE;
110
111
112 #if !VM_FAULT_STATIC_CONFIG
113 boolean_t vm_fault_dirty_handling = FALSE;
114 boolean_t vm_fault_interruptible = FALSE;
115 boolean_t software_reference_bits = TRUE;
116 #endif
117
118 #if MACH_KDB
119 extern struct db_watchpoint *db_watchpoint_list;
120 #endif /* MACH_KDB */
121
122 /* Forward declarations of internal routines. */
123 extern kern_return_t vm_fault_wire_fast(
124 vm_map_t map,
125 vm_offset_t va,
126 vm_map_entry_t entry,
127 pmap_t pmap,
128 vm_offset_t pmap_addr);
129
130 extern void vm_fault_continue(void);
131
132 extern void vm_fault_copy_cleanup(
133 vm_page_t page,
134 vm_page_t top_page);
135
136 extern void vm_fault_copy_dst_cleanup(
137 vm_page_t page);
138
139 #if VM_FAULT_CLASSIFY
140 extern void vm_fault_classify(vm_object_t object,
141 vm_object_offset_t offset,
142 vm_prot_t fault_type);
143
144 extern void vm_fault_classify_init(void);
145 #endif
146
147 /*
148 * Routine: vm_fault_init
149 * Purpose:
150 * Initialize our private data structures.
151 */
152 void
153 vm_fault_init(void)
154 {
155 }
156
157 /*
158 * Routine: vm_fault_cleanup
159 * Purpose:
160 * Clean up the result of vm_fault_page.
161 * Results:
162 * The paging reference for "object" is released.
163 * "object" is unlocked.
164 * If "top_page" is not null, "top_page" is
165 * freed and the paging reference for the object
166 * containing it is released.
167 *
168 * In/out conditions:
169 * "object" must be locked.
170 */
171 void
172 vm_fault_cleanup(
173 register vm_object_t object,
174 register vm_page_t top_page)
175 {
176 vm_object_paging_end(object);
177 vm_object_unlock(object);
178
179 if (top_page != VM_PAGE_NULL) {
180 object = top_page->object;
181 vm_object_lock(object);
182 VM_PAGE_FREE(top_page);
183 vm_object_paging_end(object);
184 vm_object_unlock(object);
185 }
186 }
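
/*
 * Illustrative sketch (not part of the build): the teardown a successful
 * caller of vm_fault_page() performs once it is finished with the page it
 * was handed.  "m" and "top_page" stand for the result_page and top_page
 * values returned by vm_fault_page(); at this point the page's object is
 * still locked and still holds a paging reference.
 */
#if 0
	PAGE_WAKEUP_DONE(m);			/* clear the busy bit we were given */
	vm_fault_cleanup(m->object, top_page);	/* drop the paging reference,
						 * unlock the object, and free
						 * the placeholder top_page */
#endif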
187
188 #if MACH_CLUSTER_STATS
189 #define MAXCLUSTERPAGES 16
190 struct {
191 unsigned long pages_in_cluster;
192 unsigned long pages_at_higher_offsets;
193 unsigned long pages_at_lower_offsets;
194 } cluster_stats_in[MAXCLUSTERPAGES];
195 #define CLUSTER_STAT(clause) clause
196 #define CLUSTER_STAT_HIGHER(x) \
197 ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
198 #define CLUSTER_STAT_LOWER(x) \
199 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
200 #define CLUSTER_STAT_CLUSTER(x) \
201 ((cluster_stats_in[(x)].pages_in_cluster)++)
202 #else /* MACH_CLUSTER_STATS */
203 #define CLUSTER_STAT(clause)
204 #endif /* MACH_CLUSTER_STATS */
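
/*
 * Illustrative sketch (not part of the build): CLUSTER_STAT() wraps
 * statistics-only code so it compiles away when MACH_CLUSTER_STATS is off.
 * "cluster_index" below is a hypothetical index into cluster_stats_in[].
 */
#if 0
	CLUSTER_STAT(int pages_at_higher_offsets;)	/* declaration only kept
							 * when stats are on */
	CLUSTER_STAT_HIGHER(cluster_index);		/* bump one counter */
#endif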
205
206 /* XXX - temporary */
207 boolean_t vm_allow_clustered_pagein = FALSE;
208 int vm_pagein_cluster_used = 0;
209
210 /*
211 * Prepage default sizes given VM_BEHAVIOR_DEFAULT reference behavior
212 */
213 int vm_default_ahead = 1; /* Number of pages to prepage ahead */
214 int vm_default_behind = 0; /* Number of pages to prepage behind */
215
216 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
217
218 /*
219 * Routine: vm_fault_page
220 * Purpose:
221 * Find the resident page for the virtual memory
222 * specified by the given virtual memory object
223 * and offset.
224 * Additional arguments:
225 * The required permissions for the page is given
226 * in "fault_type". Desired permissions are included
227 * in "protection". The minimum and maximum valid offsets
228 * within the object for the relevant map entry are
229 * passed in "lo_offset" and "hi_offset" respectively and
230 * the expected page reference pattern is passed in "behavior".
231 * These three parameters are used to determine pagein cluster
232 * limits.
233 *
234 * If the desired page is known to be resident (for
235 * example, because it was previously wired down), asserting
236 * the "unwiring" parameter will speed the search.
237 *
238 * If the operation can be interrupted (by thread_abort
239 * or thread_terminate), then the "interruptible"
240 * parameter should be asserted.
241 *
242 * Results:
243 * The page containing the proper data is returned
244 * in "result_page".
245 *
246 * In/out conditions:
247 * The source object must be locked and referenced,
248 * and must donate one paging reference. The reference
249 * is not affected. The paging reference and lock are
250 * consumed.
251 *
252 * If the call succeeds, the object in which "result_page"
253 * resides is left locked and holding a paging reference.
254 * If this is not the original object, a busy page in the
255 * original object is returned in "top_page", to prevent other
256 * callers from pursuing this same data, along with a paging
257 * reference for the original object. The "top_page" should
258 * be destroyed when this guarantee is no longer required.
259 * The "result_page" is also left busy. It is not removed
260 * from the pageout queues.
261 */
262
263 vm_fault_return_t
264 vm_fault_page(
265 /* Arguments: */
266 vm_object_t first_object, /* Object to begin search */
267 vm_object_offset_t first_offset, /* Offset into object */
268 vm_prot_t fault_type, /* What access is requested */
269 boolean_t must_be_resident,/* Must page be resident? */
270 int interruptible, /* how may fault be interrupted? */
271 vm_object_offset_t lo_offset, /* Map entry start */
272 vm_object_offset_t hi_offset, /* Map entry end */
273 vm_behavior_t behavior, /* Page reference behavior */
274 /* Modifies in place: */
275 vm_prot_t *protection, /* Protection for mapping */
276 /* Returns: */
277 vm_page_t *result_page, /* Page found, if successful */
278 vm_page_t *top_page, /* Page in top object, if
279 * not result_page. */
280 int *type_of_fault, /* if non-null, fill in with type of fault
281 * COW, zero-fill, etc... returned in trace point */
282 /* More arguments: */
283 kern_return_t *error_code, /* code if page is in error */
284 boolean_t no_zero_fill, /* don't zero fill absent pages */
285 boolean_t data_supply, /* treat as data_supply if
286 * it is a write fault and a full
287 * page is provided */
288 vm_map_t map,
289 vm_offset_t vaddr)
290 {
291 register
292 vm_page_t m;
293 register
294 vm_object_t object;
295 register
296 vm_object_offset_t offset;
297 vm_page_t first_m;
298 vm_object_t next_object;
299 vm_object_t copy_object;
300 boolean_t look_for_page;
301 vm_prot_t access_required = fault_type;
302 vm_prot_t wants_copy_flag;
303 vm_size_t cluster_size, length;
304 vm_object_offset_t cluster_offset;
305 vm_object_offset_t cluster_start, cluster_end, paging_offset;
306 vm_object_offset_t align_offset;
307 CLUSTER_STAT(int pages_at_higher_offsets;)
308 CLUSTER_STAT(int pages_at_lower_offsets;)
309 kern_return_t wait_result;
310 boolean_t interruptible_state;
311 boolean_t bumped_pagein = FALSE;
312
313
314 #if MACH_PAGEMAP
315 /*
316 * MACH page map - an optional optimization where a bit map is maintained
317 * by the VM subsystem for internal objects to indicate which pages of
318 * the object currently reside on backing store. This existence map
319 * duplicates information maintained by the vnode pager. It is
320 * created at the time of the first pageout against the object, i.e.
321 * at the same time pager for the object is created. The optimization
322 * is designed to eliminate pager interaction overhead, if it is
323 * 'known' that the page does not exist on backing store.
324 *
325 * LOOK_FOR() evaluates to TRUE if the page specified by object/offset is
326 * either marked as paged out in the existence map for the object or no
327 * existence map exists for the object. LOOK_FOR() is one of the
328 * criteria in the decision to invoke the pager. It is also used as one
329 * of the criteria to terminate the scan for adjacent pages in a clustered
330 * pagein operation. Note that LOOK_FOR() always evaluates to TRUE for
331 * permanent objects. Note also that if the pager for an internal object
332 * has not been created, the pager is not invoked regardless of the value
333 * of LOOK_FOR() and that clustered pagein scans are only done on an object
334 * for which a pager has been created.
335 *
336 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
337 * is marked as paged out in the existence map for the object.
338 * PAGED_OUT() is used to determine if a page has already been pushed
339 * into a copy object in order to avoid a redundant page out operation.
340 */
341 #define LOOK_FOR(o, f) (vm_external_state_get((o)->existence_map, (f)) \
342 != VM_EXTERNAL_STATE_ABSENT)
343 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
344 == VM_EXTERNAL_STATE_EXISTS)
345 #else /* MACH_PAGEMAP */
346 /*
347 * If the MACH page map optimization is not enabled,
348 * LOOK_FOR() always evaluates to TRUE. The pager will always be
349 * invoked to resolve missing pages in an object, assuming the pager
350 * has been created for the object. In a clustered page operation, the
351 * absence of a page on backing store cannot be used to terminate
352 * a scan for adjacent pages since that information is available only in
353 * the pager. Hence pages that may not be paged out are potentially
354 * included in a clustered request. The vnode pager is coded to deal
355 * with any combination of absent/present pages in a clustered
356 * pagein request. PAGED_OUT() always evaluates to FALSE, i.e. the pager
357 * will always be invoked to push a dirty page into a copy object assuming
358 * a pager has been created. If the page has already been pushed, the
359 * pager will ignore the new request.
360 */
361 #define LOOK_FOR(o, f) TRUE
362 #define PAGED_OUT(o, f) FALSE
363 #endif /* MACH_PAGEMAP */
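
/*
 * Illustrative sketch (not part of the build): the two decisions the
 * predicates above feed.  LOOK_FOR() gates whether the pager is asked for
 * the page at all, and PAGED_OUT() lets a push into the copy object be
 * skipped when the page is already on backing store.  This mirrors the
 * tests made later in this routine.
 */
#if 0
	look_for_page = object->pager_created &&
			LOOK_FOR(object, offset) &&
			!data_supply;

	if (!PAGED_OUT(copy_object, copy_offset)) {
		/* not known to be on backing store: copy and push the page */
	}
#endif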
364
365 /*
366 * Recovery actions
367 */
368 #define PREPARE_RELEASE_PAGE(m) \
369 MACRO_BEGIN \
370 vm_page_lock_queues(); \
371 MACRO_END
372
373 #define DO_RELEASE_PAGE(m) \
374 MACRO_BEGIN \
375 PAGE_WAKEUP_DONE(m); \
376 if (!m->active && !m->inactive) \
377 vm_page_activate(m); \
378 vm_page_unlock_queues(); \
379 MACRO_END
380
381 #define RELEASE_PAGE(m) \
382 MACRO_BEGIN \
383 PREPARE_RELEASE_PAGE(m); \
384 DO_RELEASE_PAGE(m); \
385 MACRO_END
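
/*
 * Illustrative sketch (not part of the build): the usual error-path shape
 * built on RELEASE_PAGE() -- wake and reactivate the busy page, then unwind
 * the paging references before returning a failure code, as done for the
 * memory-shortage cases later in this routine.
 */
#if 0
	if (copy_m == VM_PAGE_NULL) {
		RELEASE_PAGE(m);
		vm_fault_cleanup(object, first_m);
		thread_interrupt_level(interruptible_state);
		return(VM_FAULT_MEMORY_SHORTAGE);
	}
#endif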
386
387 #if TRACEFAULTPAGE
388 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
389 #endif
390
391
392
393 #if !VM_FAULT_STATIC_CONFIG
394 if (vm_fault_dirty_handling
395 #if MACH_KDB
396 /*
397 * If there are watchpoints set, then
398 * we don't want to give away write permission
399 * on a read fault. Make the task write fault,
400 * so that the watchpoint code notices the access.
401 */
402 || db_watchpoint_list
403 #endif /* MACH_KDB */
404 ) {
405 /*
406 * If we aren't asking for write permission,
407 * then don't give it away. We're using write
408 * faults to set the dirty bit.
409 */
410 if (!(fault_type & VM_PROT_WRITE))
411 *protection &= ~VM_PROT_WRITE;
412 }
413
414 if (!vm_fault_interruptible)
415 interruptible = THREAD_UNINT;
416 #else /* STATIC_CONFIG */
417 #if MACH_KDB
418 /*
419 * If there are watchpoints set, then
420 * we don't want to give away write permission
421 * on a read fault. Make the task write fault,
422 * so that the watchpoint code notices the access.
423 */
424 if (db_watchpoint_list) {
425 /*
426 * If we aren't asking for write permission,
427 * then don't give it away. We're using write
428 * faults to set the dirty bit.
429 */
430 if (!(fault_type & VM_PROT_WRITE))
431 *protection &= ~VM_PROT_WRITE;
432 }
433
434 #endif /* MACH_KDB */
435 #endif /* STATIC_CONFIG */
436
437 interruptible_state = thread_interrupt_level(interruptible);
438
439 /*
440 * INVARIANTS (through entire routine):
441 *
442 * 1) At all times, we must either have the object
443 * lock or a busy page in some object to prevent
444 * some other thread from trying to bring in
445 * the same page.
446 *
447 * Note that we cannot hold any locks during the
448 * pager access or when waiting for memory, so
449 * we use a busy page then.
450 *
451 * Note also that we aren't as concerned about more than
452 * one thread attempting to memory_object_data_unlock
453 * the same page at once, so we don't hold the page
454 * as busy then, but do record the highest unlock
455 * value so far. [Unlock requests may also be delivered
456 * out of order.]
457 *
458 * 2) To prevent another thread from racing us down the
459 * shadow chain and entering a new page in the top
460 * object before we do, we must keep a busy page in
461 * the top object while following the shadow chain.
462 *
463 * 3) We must increment paging_in_progress on any object
464 * for which we have a busy page
465 *
466 * 4) We leave busy pages on the pageout queues.
467 * If the pageout daemon comes across a busy page,
468 * it will remove the page from the pageout queues.
469 */
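
/*
 * Illustrative sketch (not part of the build) of invariant (1) above:
 * before the object lock is dropped to call the pager or wait for memory,
 * the page is marked busy so no other thread tries to bring in the same
 * data; the lock is retaken before the busy bit is cleared.
 */
#if 0
	assert(!m->busy);
	m->busy = TRUE;			/* busy page now guards this fault */
	vm_object_unlock(object);	/* safe to drop the lock and block */
	/* ... pager request or memory wait ... */
	vm_object_lock(object);
	PAGE_WAKEUP_DONE(m);		/* done: clear busy, wake waiters */
#endif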
470
471 /*
472 * Search for the page at object/offset.
473 */
474
475 object = first_object;
476 offset = first_offset;
477 first_m = VM_PAGE_NULL;
478 access_required = fault_type;
479
480 XPR(XPR_VM_FAULT,
481 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
482 (integer_t)object, offset, fault_type, *protection, 0);
483
484 /*
485 * See whether this page is resident
486 */
487
488 while (TRUE) {
489 #if TRACEFAULTPAGE
490 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
491 #endif
492 if (!object->alive) {
493 vm_fault_cleanup(object, first_m);
494 thread_interrupt_level(interruptible_state);
495 return(VM_FAULT_MEMORY_ERROR);
496 }
497 m = vm_page_lookup(object, offset);
498 #if TRACEFAULTPAGE
499 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
500 #endif
501 if (m != VM_PAGE_NULL) {
502 /*
503 * If the page was pre-paged as part of a
504 * cluster, record the fact.
505 */
506 if (m->clustered) {
507 vm_pagein_cluster_used++;
508 m->clustered = FALSE;
509 }
510
511 /*
512 * If the page is being brought in,
513 * wait for it and then retry.
514 *
515 * A possible optimization: if the page
516 * is known to be resident, we can ignore
517 * pages that are absent (regardless of
518 * whether they're busy).
519 */
520
521 if (m->busy) {
522 #if TRACEFAULTPAGE
523 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
524 #endif
525 wait_result = PAGE_SLEEP(object, m, interruptible);
526 XPR(XPR_VM_FAULT,
527 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
528 (integer_t)object, offset,
529 (integer_t)m, 0, 0);
530 counter(c_vm_fault_page_block_busy_kernel++);
531
532 if (wait_result != THREAD_AWAKENED) {
533 vm_fault_cleanup(object, first_m);
534 thread_interrupt_level(interruptible_state);
535 if (wait_result == THREAD_RESTART)
536 {
537 return(VM_FAULT_RETRY);
538 }
539 else
540 {
541 return(VM_FAULT_INTERRUPTED);
542 }
543 }
544 continue;
545 }
546
547 /*
548 * If the page is in error, give up now.
549 */
550
551 if (m->error) {
552 #if TRACEFAULTPAGE
553 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
554 #endif
555 if (error_code)
556 *error_code = m->page_error;
557 VM_PAGE_FREE(m);
558 vm_fault_cleanup(object, first_m);
559 thread_interrupt_level(interruptible_state);
560 return(VM_FAULT_MEMORY_ERROR);
561 }
562
563 /*
564 * If the pager wants us to restart
565 * at the top of the chain,
566 * typically because it has moved the
567 * page to another pager, then do so.
568 */
569
570 if (m->restart) {
571 #if TRACEFAULTPAGE
572 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
573 #endif
574 VM_PAGE_FREE(m);
575 vm_fault_cleanup(object, first_m);
576 thread_interrupt_level(interruptible_state);
577 return(VM_FAULT_RETRY);
578 }
579
580 /*
581 * If the page isn't busy, but is absent,
582 * then it was deemed "unavailable".
583 */
584
585 if (m->absent) {
586 /*
587 * Remove the non-existent page (unless it's
588 * in the top object) and move on down to the
589 * next object (if there is one).
590 */
591 #if TRACEFAULTPAGE
592 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
593 #endif
594
595 next_object = object->shadow;
596 if (next_object == VM_OBJECT_NULL) {
597 vm_page_t real_m;
598
599 assert(!must_be_resident);
600
601 if (object->shadow_severed) {
602 vm_fault_cleanup(
603 object, first_m);
604 thread_interrupt_level(interruptible_state);
605 return VM_FAULT_MEMORY_ERROR;
606 }
607
608 /*
609 * Absent page at bottom of shadow
610 * chain; zero fill the page we left
611 * busy in the first object, and flush
612 * the absent page. But first we
613 * need to allocate a real page.
614 */
615 if (VM_PAGE_THROTTLED() ||
616 (real_m = vm_page_grab()) == VM_PAGE_NULL) {
617 vm_fault_cleanup(object, first_m);
618 thread_interrupt_level(interruptible_state);
619 return(VM_FAULT_MEMORY_SHORTAGE);
620 }
621
622 XPR(XPR_VM_FAULT,
623 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
624 (integer_t)object, offset,
625 (integer_t)m,
626 (integer_t)first_object, 0);
627 if (object != first_object) {
628 VM_PAGE_FREE(m);
629 vm_object_paging_end(object);
630 vm_object_unlock(object);
631 object = first_object;
632 offset = first_offset;
633 m = first_m;
634 first_m = VM_PAGE_NULL;
635 vm_object_lock(object);
636 }
637
638 VM_PAGE_FREE(m);
639 assert(real_m->busy);
640 vm_page_insert(real_m, object, offset);
641 m = real_m;
642
643 /*
644 * Drop the lock while zero filling
645 * page. Then break because this
646 * is the page we wanted. Checking
647 * the page lock is a waste of time;
648 * this page was either absent or
649 * newly allocated -- in both cases
650 * it can't be page locked by a pager.
651 */
652 m->no_isync = FALSE;
653
654 if (!no_zero_fill) {
655 vm_object_unlock(object);
656 vm_page_zero_fill(m);
657 if (type_of_fault)
658 *type_of_fault = DBG_ZERO_FILL_FAULT;
659 VM_STAT(zero_fill_count++);
660
661 if (bumped_pagein == TRUE) {
662 VM_STAT(pageins--);
663 current_task()->pageins--;
664 }
665 vm_object_lock(object);
666 }
667 pmap_clear_modify(m->phys_addr);
668 vm_page_lock_queues();
669 VM_PAGE_QUEUES_REMOVE(m);
670 m->page_ticket = vm_page_ticket;
671 if(m->object->size > 0x80000) {
672 m->zero_fill = TRUE;
673 /* depends on the queues lock */
674 vm_zf_count += 1;
675 queue_enter(&vm_page_queue_zf,
676 m, vm_page_t, pageq);
677 } else {
678 queue_enter(
679 &vm_page_queue_inactive,
680 m, vm_page_t, pageq);
681 }
682 vm_page_ticket_roll++;
683 if(vm_page_ticket_roll ==
684 VM_PAGE_TICKETS_IN_ROLL) {
685 vm_page_ticket_roll = 0;
686 if(vm_page_ticket ==
687 VM_PAGE_TICKET_ROLL_IDS)
688 vm_page_ticket= 0;
689 else
690 vm_page_ticket++;
691 }
692 m->inactive = TRUE;
693 vm_page_inactive_count++;
694 vm_page_unlock_queues();
695 break;
696 } else {
697 if (must_be_resident) {
698 vm_object_paging_end(object);
699 } else if (object != first_object) {
700 vm_object_paging_end(object);
701 VM_PAGE_FREE(m);
702 } else {
703 first_m = m;
704 m->absent = FALSE;
705 m->unusual = FALSE;
706 vm_object_absent_release(object);
707 m->busy = TRUE;
708
709 vm_page_lock_queues();
710 VM_PAGE_QUEUES_REMOVE(m);
711 vm_page_unlock_queues();
712 }
713 XPR(XPR_VM_FAULT,
714 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
715 (integer_t)object, offset,
716 (integer_t)next_object,
717 offset+object->shadow_offset,0);
718 offset += object->shadow_offset;
719 hi_offset += object->shadow_offset;
720 lo_offset += object->shadow_offset;
721 access_required = VM_PROT_READ;
722 vm_object_lock(next_object);
723 vm_object_unlock(object);
724 object = next_object;
725 vm_object_paging_begin(object);
726 continue;
727 }
728 }
729
730 if ((m->cleaning)
731 && ((object != first_object) ||
732 (object->copy != VM_OBJECT_NULL))
733 && (fault_type & VM_PROT_WRITE)) {
734 /*
735 * This is a copy-on-write fault that will
736 * cause us to revoke access to this page, but
737 * this page is in the process of being cleaned
738 * in a clustered pageout. We must wait until
739 * the cleaning operation completes before
740 * revoking access to the original page,
741 * otherwise we might attempt to remove a
742 * wired mapping.
743 */
744 #if TRACEFAULTPAGE
745 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
746 #endif
747 XPR(XPR_VM_FAULT,
748 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
749 (integer_t)object, offset,
750 (integer_t)m, 0, 0);
751 /* take an extra ref so that object won't die */
752 assert(object->ref_count > 0);
753 object->ref_count++;
754 vm_object_res_reference(object);
755 vm_fault_cleanup(object, first_m);
756 counter(c_vm_fault_page_block_backoff_kernel++);
757 vm_object_lock(object);
758 assert(object->ref_count > 0);
759 m = vm_page_lookup(object, offset);
760 if (m != VM_PAGE_NULL && m->cleaning) {
761 PAGE_ASSERT_WAIT(m, interruptible);
762 vm_object_unlock(object);
763 wait_result = thread_block(THREAD_CONTINUE_NULL);
764 vm_object_deallocate(object);
765 goto backoff;
766 } else {
767 vm_object_unlock(object);
768 vm_object_deallocate(object);
769 thread_interrupt_level(interruptible_state);
770 return VM_FAULT_RETRY;
771 }
772 }
773
774 /*
775 * If the desired access to this page has
776 * been locked out, request that it be unlocked.
777 */
778
779 if (access_required & m->page_lock) {
780 if ((access_required & m->unlock_request) != access_required) {
781 vm_prot_t new_unlock_request;
782 kern_return_t rc;
783
784 #if TRACEFAULTPAGE
785 dbgTrace(0xBEEF000A, (unsigned int) m, (unsigned int) object->pager_ready); /* (TEST/DEBUG) */
786 #endif
787 if (!object->pager_ready) {
788 XPR(XPR_VM_FAULT,
789 "vm_f_page: ready wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
790 access_required,
791 (integer_t)object, offset,
792 (integer_t)m, 0);
793 /* take an extra ref */
794 assert(object->ref_count > 0);
795 object->ref_count++;
796 vm_object_res_reference(object);
797 vm_fault_cleanup(object,
798 first_m);
799 counter(c_vm_fault_page_block_backoff_kernel++);
800 vm_object_lock(object);
801 assert(object->ref_count > 0);
802 if (!object->pager_ready) {
803 wait_result = vm_object_assert_wait(
804 object,
805 VM_OBJECT_EVENT_PAGER_READY,
806 interruptible);
807 vm_object_unlock(object);
808 if (wait_result == THREAD_WAITING)
809 wait_result = thread_block(THREAD_CONTINUE_NULL);
810 vm_object_deallocate(object);
811 goto backoff;
812 } else {
813 vm_object_unlock(object);
814 vm_object_deallocate(object);
815 thread_interrupt_level(interruptible_state);
816 return VM_FAULT_RETRY;
817 }
818 }
819
820 new_unlock_request = m->unlock_request =
821 (access_required | m->unlock_request);
822 vm_object_unlock(object);
823 XPR(XPR_VM_FAULT,
824 "vm_f_page: unlock obj 0x%X, offset 0x%X, page 0x%X, unl_req %d\n",
825 (integer_t)object, offset,
826 (integer_t)m, new_unlock_request, 0);
827 if ((rc = memory_object_data_unlock(
828 object->pager,
829 offset + object->paging_offset,
830 PAGE_SIZE,
831 new_unlock_request))
832 != KERN_SUCCESS) {
833 if (vm_fault_debug)
834 printf("vm_fault: memory_object_data_unlock failed\n");
835 vm_object_lock(object);
836 vm_fault_cleanup(object, first_m);
837 thread_interrupt_level(interruptible_state);
838 return((rc == MACH_SEND_INTERRUPTED) ?
839 VM_FAULT_INTERRUPTED :
840 VM_FAULT_MEMORY_ERROR);
841 }
842 vm_object_lock(object);
843 continue;
844 }
845
846 XPR(XPR_VM_FAULT,
847 "vm_f_page: access wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
848 access_required, (integer_t)object,
849 offset, (integer_t)m, 0);
850 /* take an extra ref so object won't die */
851 assert(object->ref_count > 0);
852 object->ref_count++;
853 vm_object_res_reference(object);
854 vm_fault_cleanup(object, first_m);
855 counter(c_vm_fault_page_block_backoff_kernel++);
856 vm_object_lock(object);
857 assert(object->ref_count > 0);
858 m = vm_page_lookup(object, offset);
859 if (m != VM_PAGE_NULL &&
860 (access_required & m->page_lock) &&
861 !((access_required & m->unlock_request) != access_required)) {
862 PAGE_ASSERT_WAIT(m, interruptible);
863 vm_object_unlock(object);
864 wait_result = thread_block(THREAD_CONTINUE_NULL);
865 vm_object_deallocate(object);
866 goto backoff;
867 } else {
868 vm_object_unlock(object);
869 vm_object_deallocate(object);
870 thread_interrupt_level(interruptible_state);
871 return VM_FAULT_RETRY;
872 }
873 }
874 /*
875 * We mark the page busy and leave it on
876 * the pageout queues. If the pageout
877 * daemon comes across it, then it will
878 * remove the page.
879 */
880
881 #if TRACEFAULTPAGE
882 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
883 #endif
884
885 #if !VM_FAULT_STATIC_CONFIG
886 if (!software_reference_bits) {
887 vm_page_lock_queues();
888 if (m->inactive)
889 vm_stat.reactivations++;
890
891 VM_PAGE_QUEUES_REMOVE(m);
892 vm_page_unlock_queues();
893 }
894 #endif
895 XPR(XPR_VM_FAULT,
896 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
897 (integer_t)object, offset, (integer_t)m, 0, 0);
898 assert(!m->busy);
899 m->busy = TRUE;
900 assert(!m->absent);
901 break;
902 }
903
904 look_for_page =
905 (object->pager_created) &&
906 LOOK_FOR(object, offset) &&
907 (!data_supply);
908
909 #if TRACEFAULTPAGE
910 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
911 #endif
912 if ((look_for_page || (object == first_object))
913 && !must_be_resident
914 && !(object->phys_contiguous)) {
915 /*
916 * Allocate a new page for this object/offset
917 * pair.
918 */
919
920 m = vm_page_grab_fictitious();
921 #if TRACEFAULTPAGE
922 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
923 #endif
924 if (m == VM_PAGE_NULL) {
925 vm_fault_cleanup(object, first_m);
926 thread_interrupt_level(interruptible_state);
927 return(VM_FAULT_FICTITIOUS_SHORTAGE);
928 }
929 vm_page_insert(m, object, offset);
930 }
931
932 if ((look_for_page && !must_be_resident)) {
933 kern_return_t rc;
934
935 /*
936 * If the memory manager is not ready, we
937 * cannot make requests.
938 */
939 if (!object->pager_ready) {
940 #if TRACEFAULTPAGE
941 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
942 #endif
943 if(m != VM_PAGE_NULL)
944 VM_PAGE_FREE(m);
945 XPR(XPR_VM_FAULT,
946 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
947 (integer_t)object, offset, 0, 0, 0);
948 /* take an extra ref so object won't die */
949 assert(object->ref_count > 0);
950 object->ref_count++;
951 vm_object_res_reference(object);
952 vm_fault_cleanup(object, first_m);
953 counter(c_vm_fault_page_block_backoff_kernel++);
954 vm_object_lock(object);
955 assert(object->ref_count > 0);
956 if (!object->pager_ready) {
957 wait_result = vm_object_assert_wait(object,
958 VM_OBJECT_EVENT_PAGER_READY,
959 interruptible);
960 vm_object_unlock(object);
961 if (wait_result == THREAD_WAITING)
962 wait_result = thread_block(THREAD_CONTINUE_NULL);
963 vm_object_deallocate(object);
964 goto backoff;
965 } else {
966 vm_object_unlock(object);
967 vm_object_deallocate(object);
968 thread_interrupt_level(interruptible_state);
969 return VM_FAULT_RETRY;
970 }
971 }
972
973 if(object->phys_contiguous) {
974 if(m != VM_PAGE_NULL) {
975 VM_PAGE_FREE(m);
976 m = VM_PAGE_NULL;
977 }
978 goto no_clustering;
979 }
980 if (object->internal) {
981 /*
982 * Requests to the default pager
983 * must reserve a real page in advance,
984 * because the pager's data-provided
985 * messages won't block for pages. IMPORTANT:
986 * this acts as a throttling mechanism
987 * for data_requests to the default
988 * pager.
989 */
990
991 #if TRACEFAULTPAGE
992 dbgTrace(0xBEEF000F, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
993 #endif
994 if (m->fictitious && !vm_page_convert(m)) {
995 VM_PAGE_FREE(m);
996 vm_fault_cleanup(object, first_m);
997 thread_interrupt_level(interruptible_state);
998 return(VM_FAULT_MEMORY_SHORTAGE);
999 }
1000 } else if (object->absent_count >
1001 vm_object_absent_max) {
1002 /*
1003 * If there are too many outstanding page
1004 * requests pending on this object, we
1005 * wait for them to be resolved now.
1006 */
1007
1008 #if TRACEFAULTPAGE
1009 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1010 #endif
1011 if(m != VM_PAGE_NULL)
1012 VM_PAGE_FREE(m);
1013 /* take an extra ref so object won't die */
1014 assert(object->ref_count > 0);
1015 object->ref_count++;
1016 vm_object_res_reference(object);
1017 vm_fault_cleanup(object, first_m);
1018 counter(c_vm_fault_page_block_backoff_kernel++);
1019 vm_object_lock(object);
1020 assert(object->ref_count > 0);
1021 if (object->absent_count > vm_object_absent_max) {
1022 vm_object_absent_assert_wait(object,
1023 interruptible);
1024 vm_object_unlock(object);
1025 wait_result = thread_block(THREAD_CONTINUE_NULL);
1026 vm_object_deallocate(object);
1027 goto backoff;
1028 } else {
1029 vm_object_unlock(object);
1030 vm_object_deallocate(object);
1031 thread_interrupt_level(interruptible_state);
1032 return VM_FAULT_RETRY;
1033 }
1034 }
1035
1036 /*
1037 * Indicate that the page is waiting for data
1038 * from the memory manager.
1039 */
1040
1041 if(m != VM_PAGE_NULL) {
1042
1043 m->list_req_pending = TRUE;
1044 m->absent = TRUE;
1045 m->unusual = TRUE;
1046 object->absent_count++;
1047
1048 }
1049
1050 no_clustering:
1051 cluster_start = offset;
1052 length = PAGE_SIZE;
1053
1054 /*
1055 * lengthen the cluster by the pages in the working set
1056 */
1057 if((map != NULL) &&
1058 (current_task()->dynamic_working_set != 0)) {
1059 cluster_end = cluster_start + length;
1060 /* tws values for start and end are just
1061 * suggestions. Therefore, as long as
1062 * build_cluster does not use pointers or
1063 * take action based on values that
1064 * could be affected by re-entrance we
1065 * do not need to take the map lock.
1066 */
1067 cluster_end = offset + PAGE_SIZE_64;
1068 tws_build_cluster((tws_hash_t)
1069 current_task()->dynamic_working_set,
1070 object, &cluster_start,
1071 &cluster_end, 0x40000);
1072 length = cluster_end - cluster_start;
1073 }
1074 #if TRACEFAULTPAGE
1075 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1076 #endif
1077 /*
1078 * We have a busy page, so we can
1079 * release the object lock.
1080 */
1081 vm_object_unlock(object);
1082
1083 /*
1084 * Call the memory manager to retrieve the data.
1085 */
1086
1087 if (type_of_fault)
1088 *type_of_fault = (length << 8) | DBG_PAGEIN_FAULT;
1089 VM_STAT(pageins++);
1090 current_task()->pageins++;
1091 bumped_pagein = TRUE;
1092
1093 /*
1094 * If this object uses a copy_call strategy,
1095 * and we are interested in a copy of this object
1096 * (having gotten here only by following a
1097 * shadow chain), then tell the memory manager
1098 * via a flag added to the desired_access
1099 * parameter, so that it can detect a race
1100 * between our walking down the shadow chain
1101 * and its pushing pages up into a copy of
1102 * the object that it manages.
1103 */
1104
1105 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL &&
1106 object != first_object) {
1107 wants_copy_flag = VM_PROT_WANTS_COPY;
1108 } else {
1109 wants_copy_flag = VM_PROT_NONE;
1110 }
1111
1112 XPR(XPR_VM_FAULT,
1113 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1114 (integer_t)object, offset, (integer_t)m,
1115 access_required | wants_copy_flag, 0);
1116
1117 rc = memory_object_data_request(object->pager,
1118 cluster_start + object->paging_offset,
1119 length,
1120 access_required | wants_copy_flag);
1121
1122
1123 #if TRACEFAULTPAGE
1124 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1125 #endif
1126 if (rc != KERN_SUCCESS) {
1127 if (rc != MACH_SEND_INTERRUPTED
1128 && vm_fault_debug)
1129 printf("%s(0x%x, 0x%x, 0x%x, 0x%x) failed, rc=%d\n",
1130 "memory_object_data_request",
1131 object->pager,
1132 cluster_start + object->paging_offset,
1133 length, access_required, rc);
1134 /*
1135 * Don't want to leave a busy page around,
1136 * but the data request may have blocked,
1137 * so check if it's still there and busy.
1138 */
1139 if(!object->phys_contiguous) {
1140 vm_object_lock(object);
1141 for (; length; length -= PAGE_SIZE,
1142 cluster_start += PAGE_SIZE_64) {
1143 vm_page_t p;
1144 if ((p = vm_page_lookup(object,
1145 cluster_start))
1146 && p->absent && p->busy
1147 && p != first_m) {
1148 VM_PAGE_FREE(p);
1149 }
1150 }
1151 }
1152 vm_fault_cleanup(object, first_m);
1153 thread_interrupt_level(interruptible_state);
1154 return((rc == MACH_SEND_INTERRUPTED) ?
1155 VM_FAULT_INTERRUPTED :
1156 VM_FAULT_MEMORY_ERROR);
1157 } else {
1158 #ifdef notdefcdy
1159 tws_hash_line_t line;
1160 task_t task;
1161
1162 task = current_task();
1163
1164 if((map != NULL) &&
1165 (task->dynamic_working_set != 0)
1166 && !(object->private)) {
1167 vm_object_t base_object;
1168 vm_object_offset_t base_offset;
1169 base_object = object;
1170 base_offset = offset;
1171 while(base_object->shadow) {
1172 base_offset +=
1173 base_object->shadow_offset;
1174 base_object =
1175 base_object->shadow;
1176 }
1177 if(tws_lookup
1178 ((tws_hash_t)
1179 task->dynamic_working_set,
1180 base_offset, base_object,
1181 &line) == KERN_SUCCESS) {
1182 tws_line_signal((tws_hash_t)
1183 task->dynamic_working_set,
1184 map, line, vaddr);
1185 }
1186 }
1187 #endif
1188 }
1189
1190 /*
1191 * Retry with same object/offset, since new data may
1192 * be in a different page (i.e., m is meaningless at
1193 * this point).
1194 */
1195 vm_object_lock(object);
1196 if ((interruptible != THREAD_UNINT) &&
1197 (current_thread()->state & TH_ABORT)) {
1198 vm_fault_cleanup(object, first_m);
1199 thread_interrupt_level(interruptible_state);
1200 return(VM_FAULT_INTERRUPTED);
1201 }
1202 if(m == VM_PAGE_NULL)
1203 break;
1204 continue;
1205 }
1206
1207 /*
1208 * The only case in which we get here is if
1209 * object has no pager (or unwiring). If the pager doesn't
1210 * have the page this is handled in the m->absent case above
1211 * (and if you change things here you should look above).
1212 */
1213 #if TRACEFAULTPAGE
1214 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1215 #endif
1216 if (object == first_object)
1217 first_m = m;
1218 else
1219 assert(m == VM_PAGE_NULL);
1220
1221 XPR(XPR_VM_FAULT,
1222 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1223 (integer_t)object, offset, (integer_t)m,
1224 (integer_t)object->shadow, 0);
1225 /*
1226 * Move on to the next object. Lock the next
1227 * object before unlocking the current one.
1228 */
1229 next_object = object->shadow;
1230 if (next_object == VM_OBJECT_NULL) {
1231 assert(!must_be_resident);
1232 /*
1233 * If there's no object left, fill the page
1234 * in the top object with zeros. But first we
1235 * need to allocate a real page.
1236 */
1237
1238 if (object != first_object) {
1239 vm_object_paging_end(object);
1240 vm_object_unlock(object);
1241
1242 object = first_object;
1243 offset = first_offset;
1244 vm_object_lock(object);
1245 }
1246
1247 m = first_m;
1248 assert(m->object == object);
1249 first_m = VM_PAGE_NULL;
1250
1251 if (object->shadow_severed) {
1252 VM_PAGE_FREE(m);
1253 vm_fault_cleanup(object, VM_PAGE_NULL);
1254 thread_interrupt_level(interruptible_state);
1255 return VM_FAULT_MEMORY_ERROR;
1256 }
1257
1258 if (VM_PAGE_THROTTLED() ||
1259 (m->fictitious && !vm_page_convert(m))) {
1260 VM_PAGE_FREE(m);
1261 vm_fault_cleanup(object, VM_PAGE_NULL);
1262 thread_interrupt_level(interruptible_state);
1263 return(VM_FAULT_MEMORY_SHORTAGE);
1264 }
1265 m->no_isync = FALSE;
1266
1267 if (!no_zero_fill) {
1268 vm_object_unlock(object);
1269 vm_page_zero_fill(m);
1270 if (type_of_fault)
1271 *type_of_fault = DBG_ZERO_FILL_FAULT;
1272 VM_STAT(zero_fill_count++);
1273
1274 if (bumped_pagein == TRUE) {
1275 VM_STAT(pageins--);
1276 current_task()->pageins--;
1277 }
1278 vm_object_lock(object);
1279 }
1280 vm_page_lock_queues();
1281 VM_PAGE_QUEUES_REMOVE(m);
1282 if(m->object->size > 0x80000) {
1283 m->zero_fill = TRUE;
1284 /* depends on the queues lock */
1285 vm_zf_count += 1;
1286 queue_enter(&vm_page_queue_zf,
1287 m, vm_page_t, pageq);
1288 } else {
1289 queue_enter(
1290 &vm_page_queue_inactive,
1291 m, vm_page_t, pageq);
1292 }
1293 m->page_ticket = vm_page_ticket;
1294 vm_page_ticket_roll++;
1295 if(vm_page_ticket_roll == VM_PAGE_TICKETS_IN_ROLL) {
1296 vm_page_ticket_roll = 0;
1297 if(vm_page_ticket ==
1298 VM_PAGE_TICKET_ROLL_IDS)
1299 vm_page_ticket= 0;
1300 else
1301 vm_page_ticket++;
1302 }
1303 m->inactive = TRUE;
1304 vm_page_inactive_count++;
1305 vm_page_unlock_queues();
1306 pmap_clear_modify(m->phys_addr);
1307 break;
1308 }
1309 else {
1310 if ((object != first_object) || must_be_resident)
1311 vm_object_paging_end(object);
1312 offset += object->shadow_offset;
1313 hi_offset += object->shadow_offset;
1314 lo_offset += object->shadow_offset;
1315 access_required = VM_PROT_READ;
1316 vm_object_lock(next_object);
1317 vm_object_unlock(object);
1318 object = next_object;
1319 vm_object_paging_begin(object);
1320 }
1321 }
1322
1323 /*
1324 * PAGE HAS BEEN FOUND.
1325 *
1326 * This page (m) is:
1327 * busy, so that we can play with it;
1328 * not absent, so that nobody else will fill it;
1329 * possibly eligible for pageout;
1330 *
1331 * The top-level page (first_m) is:
1332 * VM_PAGE_NULL if the page was found in the
1333 * top-level object;
1334 * busy, not absent, and ineligible for pageout.
1335 *
1336 * The current object (object) is locked. A paging
1337 * reference is held for the current and top-level
1338 * objects.
1339 */
1340
1341 #if TRACEFAULTPAGE
1342 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1343 #endif
1344 #if EXTRA_ASSERTIONS
1345 if(m != VM_PAGE_NULL) {
1346 assert(m->busy && !m->absent);
1347 assert((first_m == VM_PAGE_NULL) ||
1348 (first_m->busy && !first_m->absent &&
1349 !first_m->active && !first_m->inactive));
1350 }
1351 #endif /* EXTRA_ASSERTIONS */
1352
1353 XPR(XPR_VM_FAULT,
1354 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1355 (integer_t)object, offset, (integer_t)m,
1356 (integer_t)first_object, (integer_t)first_m);
1357 /*
1358 * If the page is being written, but isn't
1359 * already owned by the top-level object,
1360 * we have to copy it into a new page owned
1361 * by the top-level object.
1362 */
1363
1364 if ((object != first_object) && (m != VM_PAGE_NULL)) {
1365 /*
1366 * We only really need to copy if we
1367 * want to write it.
1368 */
1369
1370 #if TRACEFAULTPAGE
1371 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1372 #endif
1373 if (fault_type & VM_PROT_WRITE) {
1374 vm_page_t copy_m;
1375
1376 assert(!must_be_resident);
1377
1378 /*
1379 * If we try to collapse first_object at this
1380 * point, we may deadlock when we try to get
1381 * the lock on an intermediate object (since we
1382 * have the bottom object locked). We can't
1383 * unlock the bottom object, because the page
1384 * we found may move (by collapse) if we do.
1385 *
1386 * Instead, we first copy the page. Then, when
1387 * we have no more use for the bottom object,
1388 * we unlock it and try to collapse.
1389 *
1390 * Note that we copy the page even if we didn't
1391 * need to... that's the breaks.
1392 */
1393
1394 /*
1395 * Allocate a page for the copy
1396 */
1397 copy_m = vm_page_grab();
1398 if (copy_m == VM_PAGE_NULL) {
1399 RELEASE_PAGE(m);
1400 vm_fault_cleanup(object, first_m);
1401 thread_interrupt_level(interruptible_state);
1402 return(VM_FAULT_MEMORY_SHORTAGE);
1403 }
1404
1405
1406 XPR(XPR_VM_FAULT,
1407 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1408 (integer_t)object, offset,
1409 (integer_t)m, (integer_t)copy_m, 0);
1410 vm_page_copy(m, copy_m);
1411
1412 /*
1413 * If another map is truly sharing this
1414 * page with us, we have to flush all
1415 * uses of the original page, since we
1416 * can't distinguish those which want the
1417 * original from those which need the
1418 * new copy.
1419 *
1420 * XXXO If we know that only one map has
1421 * access to this page, then we could
1422 * avoid the pmap_page_protect() call.
1423 */
1424
1425 vm_page_lock_queues();
1426 assert(!m->cleaning);
1427 pmap_page_protect(m->phys_addr, VM_PROT_NONE);
1428 vm_page_deactivate(m);
1429 copy_m->dirty = TRUE;
1430 /*
1431 * Setting reference here prevents this fault from
1432 * being counted as a (per-thread) reactivate as well
1433 * as a copy-on-write.
1434 */
1435 first_m->reference = TRUE;
1436 vm_page_unlock_queues();
1437
1438 /*
1439 * We no longer need the old page or object.
1440 */
1441
1442 PAGE_WAKEUP_DONE(m);
1443 vm_object_paging_end(object);
1444 vm_object_unlock(object);
1445
1446 if (type_of_fault)
1447 *type_of_fault = DBG_COW_FAULT;
1448 VM_STAT(cow_faults++);
1449 current_task()->cow_faults++;
1450 object = first_object;
1451 offset = first_offset;
1452
1453 vm_object_lock(object);
1454 VM_PAGE_FREE(first_m);
1455 first_m = VM_PAGE_NULL;
1456 assert(copy_m->busy);
1457 vm_page_insert(copy_m, object, offset);
1458 m = copy_m;
1459
1460 /*
1461 * Now that we've gotten the copy out of the
1462 * way, let's try to collapse the top object.
1463 * But we have to play ugly games with
1464 * paging_in_progress to do that...
1465 */
1466
1467 vm_object_paging_end(object);
1468 vm_object_collapse(object);
1469 vm_object_paging_begin(object);
1470
1471 }
1472 else {
1473 *protection &= (~VM_PROT_WRITE);
1474 }
1475 }
1476
1477 /*
1478 * Now check whether the page needs to be pushed into the
1479 * copy object. The use of asymmetric copy on write for
1480 * shared temporary objects means that we may do two copies to
1481 * satisfy the fault; one above to get the page from a
1482 * shadowed object, and one here to push it into the copy.
1483 */
1484
1485 while ((copy_object = first_object->copy) != VM_OBJECT_NULL &&
1486 (m!= VM_PAGE_NULL)) {
1487 vm_object_offset_t copy_offset;
1488 vm_page_t copy_m;
1489
1490 #if TRACEFAULTPAGE
1491 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1492 #endif
1493 /*
1494 * If the page is being written, but hasn't been
1495 * copied to the copy-object, we have to copy it there.
1496 */
1497
1498 if ((fault_type & VM_PROT_WRITE) == 0) {
1499 *protection &= ~VM_PROT_WRITE;
1500 break;
1501 }
1502
1503 /*
1504 * If the page was guaranteed to be resident,
1505 * we must have already performed the copy.
1506 */
1507
1508 if (must_be_resident)
1509 break;
1510
1511 /*
1512 * Try to get the lock on the copy_object.
1513 */
1514 if (!vm_object_lock_try(copy_object)) {
1515 vm_object_unlock(object);
1516
1517 mutex_pause(); /* wait a bit */
1518
1519 vm_object_lock(object);
1520 continue;
1521 }
1522
1523 /*
1524 * Make another reference to the copy-object,
1525 * to keep it from disappearing during the
1526 * copy.
1527 */
1528 assert(copy_object->ref_count > 0);
1529 copy_object->ref_count++;
1530 VM_OBJ_RES_INCR(copy_object);
1531
1532 /*
1533 * Does the page exist in the copy?
1534 */
1535 copy_offset = first_offset - copy_object->shadow_offset;
1536 if (copy_object->size <= copy_offset)
1537 /*
1538 * Copy object doesn't cover this page -- do nothing.
1539 */
1540 ;
1541 else if ((copy_m =
1542 vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1543 /* Page currently exists in the copy object */
1544 if (copy_m->busy) {
1545 /*
1546 * If the page is being brought
1547 * in, wait for it and then retry.
1548 */
1549 RELEASE_PAGE(m);
1550 /* take an extra ref so object won't die */
1551 assert(copy_object->ref_count > 0);
1552 copy_object->ref_count++;
1553 vm_object_res_reference(copy_object);
1554 vm_object_unlock(copy_object);
1555 vm_fault_cleanup(object, first_m);
1556 counter(c_vm_fault_page_block_backoff_kernel++);
1557 vm_object_lock(copy_object);
1558 assert(copy_object->ref_count > 0);
1559 VM_OBJ_RES_DECR(copy_object);
1560 copy_object->ref_count--;
1561 assert(copy_object->ref_count > 0);
1562 copy_m = vm_page_lookup(copy_object, copy_offset);
1563 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1564 PAGE_ASSERT_WAIT(copy_m, interruptible);
1565 vm_object_unlock(copy_object);
1566 wait_result = thread_block(THREAD_CONTINUE_NULL);
1567 vm_object_deallocate(copy_object);
1568 goto backoff;
1569 } else {
1570 vm_object_unlock(copy_object);
1571 vm_object_deallocate(copy_object);
1572 thread_interrupt_level(interruptible_state);
1573 return VM_FAULT_RETRY;
1574 }
1575 }
1576 }
1577 else if (!PAGED_OUT(copy_object, copy_offset)) {
1578 /*
1579 * If PAGED_OUT is TRUE, then the page used to exist
1580 * in the copy-object, and has already been paged out.
1581 * We don't need to repeat this. If PAGED_OUT is
1582 * FALSE, then either we don't know (!pager_created,
1583 * for example) or it hasn't been paged out.
1584 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1585 * We must copy the page to the copy object.
1586 */
1587
1588 /*
1589 * Allocate a page for the copy
1590 */
1591 copy_m = vm_page_alloc(copy_object, copy_offset);
1592 if (copy_m == VM_PAGE_NULL) {
1593 RELEASE_PAGE(m);
1594 VM_OBJ_RES_DECR(copy_object);
1595 copy_object->ref_count--;
1596 assert(copy_object->ref_count > 0);
1597 vm_object_unlock(copy_object);
1598 vm_fault_cleanup(object, first_m);
1599 thread_interrupt_level(interruptible_state);
1600 return(VM_FAULT_MEMORY_SHORTAGE);
1601 }
1602
1603 /*
1604 * Must copy page into copy-object.
1605 */
1606
1607 vm_page_copy(m, copy_m);
1608
1609 /*
1610 * If the old page was in use by any users
1611 * of the copy-object, it must be removed
1612 * from all pmaps. (We can't know which
1613 * pmaps use it.)
1614 */
1615
1616 vm_page_lock_queues();
1617 assert(!m->cleaning);
1618 pmap_page_protect(m->phys_addr, VM_PROT_NONE);
1619 copy_m->dirty = TRUE;
1620 vm_page_unlock_queues();
1621
1622 /*
1623 * If there's a pager, then immediately
1624 * page out this page, using the "initialize"
1625 * option. Else, we use the copy.
1626 */
1627
1628 if
1629 #if MACH_PAGEMAP
1630 ((!copy_object->pager_created) ||
1631 vm_external_state_get(
1632 copy_object->existence_map, copy_offset)
1633 == VM_EXTERNAL_STATE_ABSENT)
1634 #else
1635 (!copy_object->pager_created)
1636 #endif
1637 {
1638 vm_page_lock_queues();
1639 vm_page_activate(copy_m);
1640 vm_page_unlock_queues();
1641 PAGE_WAKEUP_DONE(copy_m);
1642 }
1643 else {
1644 assert(copy_m->busy == TRUE);
1645
1646 /*
1647 * The page is already ready for pageout:
1648 * not on pageout queues and busy.
1649 * Unlock everything except the
1650 * copy_object itself.
1651 */
1652
1653 vm_object_unlock(object);
1654
1655 /*
1656 * Write the page to the copy-object,
1657 * flushing it from the kernel.
1658 */
1659
1660 vm_pageout_initialize_page(copy_m);
1661
1662 /*
1663 * Since the pageout may have
1664 * temporarily dropped the
1665 * copy_object's lock, we
1666 * check whether we'll have
1667 * to deallocate the hard way.
1668 */
1669
1670 if ((copy_object->shadow != object) ||
1671 (copy_object->ref_count == 1)) {
1672 vm_object_unlock(copy_object);
1673 vm_object_deallocate(copy_object);
1674 vm_object_lock(object);
1675 continue;
1676 }
1677
1678 /*
1679 * Pick back up the old object's
1680 * lock. [It is safe to do so,
1681 * since it must be deeper in the
1682 * object tree.]
1683 */
1684
1685 vm_object_lock(object);
1686 }
1687
1688 /*
1689 * Because we're pushing a page upward
1690 * in the object tree, we must restart
1691 * any faults that are waiting here.
1692 * [Note that this is an expansion of
1693 * PAGE_WAKEUP that uses the THREAD_RESTART
1694 * wait result]. Can't turn off the page's
1695 * busy bit because we're not done with it.
1696 */
1697
1698 if (m->wanted) {
1699 m->wanted = FALSE;
1700 thread_wakeup_with_result((event_t) m,
1701 THREAD_RESTART);
1702 }
1703 }
1704
1705 /*
1706 * The reference count on copy_object must be
1707 * at least 2: one for our extra reference,
1708 * and at least one from the outside world
1709 * (we checked that when we last locked
1710 * copy_object).
1711 */
1712 copy_object->ref_count--;
1713 assert(copy_object->ref_count > 0);
1714 VM_OBJ_RES_DECR(copy_object);
1715 vm_object_unlock(copy_object);
1716
1717 break;
1718 }
1719
1720 *result_page = m;
1721 *top_page = first_m;
1722
1723 XPR(XPR_VM_FAULT,
1724 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1725 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0);
1726 /*
1727 * If the page can be written, assume that it will be.
1728 * [Earlier, we restrict the permission to allow write
1729 * access only if the fault so required, so we don't
1730 * mark read-only data as dirty.]
1731 */
1732
1733 #if !VM_FAULT_STATIC_CONFIG
1734 if (vm_fault_dirty_handling && (*protection & VM_PROT_WRITE) &&
1735 (m != VM_PAGE_NULL)) {
1736 m->dirty = TRUE;
1737 }
1738 #endif
1739 #if TRACEFAULTPAGE
1740 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_page_deactivate_behind); /* (TEST/DEBUG) */
1741 #endif
1742 if (vm_page_deactivate_behind) {
1743 if (offset && /* don't underflow */
1744 (object->last_alloc == (offset - PAGE_SIZE_64))) {
1745 m = vm_page_lookup(object, object->last_alloc);
1746 if ((m != VM_PAGE_NULL) && !m->busy) {
1747 vm_page_lock_queues();
1748 vm_page_deactivate(m);
1749 vm_page_unlock_queues();
1750 }
1751 #if TRACEFAULTPAGE
1752 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1753 #endif
1754 }
1755 object->last_alloc = offset;
1756 }
1757 #if TRACEFAULTPAGE
1758 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
1759 #endif
1760 thread_interrupt_level(interruptible_state);
1761 if(*result_page == VM_PAGE_NULL) {
1762 vm_object_unlock(object);
1763 }
1764 return(VM_FAULT_SUCCESS);
1765
1766 #if 0
1767 block_and_backoff:
1768 vm_fault_cleanup(object, first_m);
1769
1770 counter(c_vm_fault_page_block_backoff_kernel++);
1771 thread_block(THREAD_CONTINUE_NULL);
1772 #endif
1773
1774 backoff:
1775 thread_interrupt_level(interruptible_state);
1776 if (wait_result == THREAD_INTERRUPTED)
1777 return VM_FAULT_INTERRUPTED;
1778 return VM_FAULT_RETRY;
1779
1780 #undef RELEASE_PAGE
1781 }
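
/*
 * Illustrative sketch (not part of the build): how a caller such as
 * vm_fault() below typically consumes the result of vm_fault_page().  On
 * any return other than VM_FAULT_SUCCESS the routine has already done its
 * own cleanup; on success the caller owns a busy result_page whose object
 * is locked with a paging reference.  Variable names mirror those used in
 * vm_fault() and are otherwise hypothetical.
 */
#if 0
	kr = vm_fault_page(object, offset, fault_type,
			   (change_wiring && !wired), interruptible,
			   lo_offset, hi_offset, behavior,
			   &prot, &result_page, &top_page,
			   &type_of_fault, &error_code, FALSE, FALSE,
			   map, vaddr);

	switch (kr) {
	case VM_FAULT_SUCCESS:
		/* map result_page, then release it and top_page via
		 * PAGE_WAKEUP_DONE() and vm_fault_cleanup() */
		break;
	case VM_FAULT_MEMORY_SHORTAGE:
		VM_PAGE_WAIT();		/* wait for free pages... */
		/* FALLTHROUGH */
	case VM_FAULT_RETRY:
		/* ...and restart the lookup from the top */
		break;
	case VM_FAULT_INTERRUPTED:
		/* abort; no pages or locks are held */
		break;
	case VM_FAULT_MEMORY_ERROR:
		/* report error_code if the pager supplied one */
		break;
	}
#endif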
1782
1783 /*
1784 * Routine: vm_fault
1785 * Purpose:
1786 * Handle page faults, including pseudo-faults
1787 * used to change the wiring status of pages.
1788 * Returns:
1789 * Explicit continuations have been removed.
1790 * Implementation:
1791 * vm_fault and vm_fault_page save mucho state
1792 * in the moral equivalent of a closure. The state
1793 * structure is allocated when first entering vm_fault
1794 * and deallocated when leaving vm_fault.
1795 */
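
/*
 * Illustrative sketch (not part of the build): a typical call to vm_fault()
 * as it might appear in a machine-dependent trap handler.  The map, address
 * and protection are hypothetical; a NULL caller_pmap means the mapping is
 * entered into the pmap of the map found by the lookup.
 */
#if 0
	kr = vm_fault(current_map(),		/* map containing the address */
		      trunc_page(fault_addr),	/* faulting virtual address */
		      VM_PROT_READ,		/* access that caused the fault */
		      FALSE,			/* not a wiring change */
		      THREAD_ABORTSAFE,		/* may be interrupted */
		      NULL, 0);			/* no caller-supplied pmap */
#endif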
1796
1797 kern_return_t
1798 vm_fault(
1799 vm_map_t map,
1800 vm_offset_t vaddr,
1801 vm_prot_t fault_type,
1802 boolean_t change_wiring,
1803 int interruptible,
1804 pmap_t caller_pmap,
1805 vm_offset_t caller_pmap_addr)
1806 {
1807 vm_map_version_t version; /* Map version for verification */
1808 boolean_t wired; /* Should mapping be wired down? */
1809 vm_object_t object; /* Top-level object */
1810 vm_object_offset_t offset; /* Top-level offset */
1811 vm_prot_t prot; /* Protection for mapping */
1812 vm_behavior_t behavior; /* Expected paging behavior */
1813 vm_object_offset_t lo_offset, hi_offset;
1814 vm_object_t old_copy_object; /* Saved copy object */
1815 vm_page_t result_page; /* Result of vm_fault_page */
1816 vm_page_t top_page; /* Placeholder page */
1817 kern_return_t kr;
1818
1819 register
1820 vm_page_t m; /* Fast access to result_page */
1821 kern_return_t error_code; /* page error reasons */
1822 register
1823 vm_object_t cur_object;
1824 register
1825 vm_object_offset_t cur_offset;
1826 vm_page_t cur_m;
1827 vm_object_t new_object;
1828 int type_of_fault;
1829 vm_map_t pmap_map = map;
1830 vm_map_t original_map = map;
1831 pmap_t pmap = NULL;
1832 boolean_t funnel_set = FALSE;
1833 funnel_t *curflock;
1834 thread_t cur_thread;
1835 boolean_t interruptible_state;
1836 unsigned int cache_attr;
1837 int write_startup_file = 0;
1838 vm_prot_t full_fault_type;
1839
1840
1841
1842 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_START,
1843 vaddr,
1844 0,
1845 0,
1846 0,
1847 0);
1848
1849 cur_thread = current_thread();
1850 /* at present we do not fully check for execute permission */
1851 /* we generally treat it as read except in certain device */
1852 /* memory settings */
1853 full_fault_type = fault_type;
1854 if(fault_type & VM_PROT_EXECUTE) {
1855 fault_type &= ~VM_PROT_EXECUTE;
1856 fault_type |= VM_PROT_READ;
1857 }
1858
1859 interruptible_state = thread_interrupt_level(interruptible);
1860
1861 /*
1862 * assume we will hit a page in the cache
1863 * otherwise, explicitly override with
1864 * the real fault type once we determine it
1865 */
1866 type_of_fault = DBG_CACHE_HIT_FAULT;
1867
1868 VM_STAT(faults++);
1869 current_task()->faults++;
1870
1871 /*
1872 * drop funnel if it is already held. Then restore while returning
1873 */
1874 if ((cur_thread->funnel_state & TH_FN_OWNED) == TH_FN_OWNED) {
1875 funnel_set = TRUE;
1876 curflock = cur_thread->funnel_lock;
1877 thread_funnel_set( curflock , FALSE);
1878 }
1879
1880 RetryFault: ;
1881
1882 /*
1883 * Find the backing store object and offset into
1884 * it to begin the search.
1885 */
1886 map = original_map;
1887 vm_map_lock_read(map);
1888 kr = vm_map_lookup_locked(&map, vaddr, fault_type, &version,
1889 &object, &offset,
1890 &prot, &wired,
1891 &behavior, &lo_offset, &hi_offset, &pmap_map);
1892
1893 pmap = pmap_map->pmap;
1894
1895 if (kr != KERN_SUCCESS) {
1896 vm_map_unlock_read(map);
1897 goto done;
1898 }
1899
1900 /*
1901 * If the page is wired, we must fault for the current protection
1902 * value, to avoid further faults.
1903 */
1904
1905 if (wired)
1906 fault_type = prot | VM_PROT_WRITE;
1907
1908 #if VM_FAULT_CLASSIFY
1909 /*
1910 * Temporary data gathering code
1911 */
1912 vm_fault_classify(object, offset, fault_type);
1913 #endif
1914 /*
1915 * Fast fault code. The basic idea is to do as much as
1916 * possible while holding the map lock and object locks.
1917 * Busy pages are not used until the object lock has to
1918 * be dropped to do something (copy, zero fill, pmap enter).
1919 * Similarly, paging references aren't acquired until that
1920 * point, and object references aren't used.
1921 *
1922 * If we can figure out what to do
1923 * (zero fill, copy on write, pmap enter) while holding
1924 * the locks, then it gets done. Otherwise, we give up,
1925 * and use the original fault path (which doesn't hold
1926 * the map lock, and relies on busy pages).
1927 * The give up cases include:
1928 * - Have to talk to pager.
1929 * - Page is busy, absent or in error.
1930 * - Pager has locked out desired access.
1931 * - Fault needs to be restarted.
1932 * - Have to push page into copy object.
1933 *
1934 * The code is an infinite loop that moves one level down
1935 * the shadow chain each time. cur_object and cur_offset
1936 * refer to the current object being examined. object and offset
1937 * are the original object from the map. The loop is at the
1938 * top level if and only if object and cur_object are the same.
1939 *
1940 * Invariants: Map lock is held throughout. Lock is held on
1941 * original object and cur_object (if different) when
1942 * continuing or exiting loop.
1943 *
1944 */
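/*
 * Illustrative sketch of the loop below (pseudocode only; the
 * busy-page waits, lock juggling and working-set bookkeeping
 * are omitted):
 *
 *	cur_object = object; cur_offset = offset;
 *	for (;;) {
 *		m = vm_page_lookup(cur_object, cur_offset);
 *		if (m != VM_PAGE_NULL)
 *			map it in, or copy it up into 'object' (COW),
 *			then FastPmapEnter;
 *		else if (cur_object->pager_created)
 *			break;			give up, slow path
 *		else if (cur_object->shadow == VM_OBJECT_NULL)
 *			zero fill a fresh page in 'object',
 *			then FastPmapEnter;
 *		else {
 *			cur_offset += cur_object->shadow_offset;
 *			cur_object = cur_object->shadow;
 *		}
 *	}
 */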
1945
1946
1947 /*
1948 * If this page is to be inserted in a copy delay object
1949 * for writing, and if the object has a copy, then the
1950 * copy delay strategy is implemented in the slow fault path (vm_fault_page).
1951 */
1952 if (object->copy_strategy != MEMORY_OBJECT_COPY_DELAY ||
1953 object->copy == VM_OBJECT_NULL ||
1954 (fault_type & VM_PROT_WRITE) == 0) {
1955 cur_object = object;
1956 cur_offset = offset;
1957
1958 while (TRUE) {
1959 m = vm_page_lookup(cur_object, cur_offset);
1960 if (m != VM_PAGE_NULL) {
1961 if (m->busy) {
1962 wait_result_t result;
1963
1964 if (object != cur_object)
1965 vm_object_unlock(object);
1966
1967 vm_map_unlock_read(map);
1968 if (pmap_map != map)
1969 vm_map_unlock(pmap_map);
1970
1971 #if !VM_FAULT_STATIC_CONFIG
1972 if (!vm_fault_interruptible)
1973 interruptible = THREAD_UNINT;
1974 #endif
1975 result = PAGE_ASSERT_WAIT(m, interruptible);
1976
1977 vm_object_unlock(cur_object);
1978
1979 if (result == THREAD_WAITING) {
1980 result = thread_block(THREAD_CONTINUE_NULL);
1981
1982 counter(c_vm_fault_page_block_busy_kernel++);
1983 }
1984 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
1985 goto RetryFault;
1986
1987 kr = KERN_ABORTED;
1988 goto done;
1989 }
1990 if (m->unusual && (m->error || m->restart || m->private
1991 || m->absent || (fault_type & m->page_lock))) {
1992
1993 /*
1994 * Unusual case. Give up.
1995 */
1996 break;
1997 }
1998
1999 /*
2000 * Two cases of map-in faults:
2001 * - At top level w/o copy object.
2002 * - Read fault anywhere.
2003 * --> must disallow write.
2004 */
2005
2006 if (object == cur_object &&
2007 object->copy == VM_OBJECT_NULL)
2008 goto FastMapInFault;
2009
2010 if ((fault_type & VM_PROT_WRITE) == 0) {
2011
2012 prot &= ~VM_PROT_WRITE;
2013
2014 /*
2015 * Set up to map the page ...
2016 * mark the page busy, drop
2017 * locks and take a paging reference
2018 * on the object with the page.
2019 */
2020
2021 if (object != cur_object) {
2022 vm_object_unlock(object);
2023 object = cur_object;
2024 }
2025 FastMapInFault:
2026 m->busy = TRUE;
2027
2028 vm_object_paging_begin(object);
2029
2030 FastPmapEnter:
2031 /*
2032 * Check a couple of global reasons to
2033 * be conservative about write access.
2034 * Then do the pmap_enter.
2035 */
2036 #if !VM_FAULT_STATIC_CONFIG
2037 if (vm_fault_dirty_handling
2038 #if MACH_KDB
2039 || db_watchpoint_list
2040 #endif
2041 && (fault_type & VM_PROT_WRITE) == 0)
2042 prot &= ~VM_PROT_WRITE;
2043 #else /* STATIC_CONFIG */
2044 #if MACH_KDB
2045 if (db_watchpoint_list
2046 && (fault_type & VM_PROT_WRITE) == 0)
2047 prot &= ~VM_PROT_WRITE;
2048 #endif /* MACH_KDB */
2049 #endif /* STATIC_CONFIG */
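/*
 * With kernel debugger watchpoints active (db_watchpoint_list),
 * read faults are entered without VM_PROT_WRITE so that a later
 * store to the page still takes a fault where it can be checked
 * against the watchpoint list.
 */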
2050 if (m->no_isync == TRUE) {
2051 pmap_sync_caches_phys(m->phys_addr);
2052 m->no_isync = FALSE;
2053 }
2054
2055 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
2056 if(caller_pmap) {
2057 PMAP_ENTER(caller_pmap,
2058 caller_pmap_addr, m,
2059 prot, cache_attr, wired);
2060 } else {
2061 PMAP_ENTER(pmap, vaddr, m,
2062 prot, cache_attr, wired);
2063 }
2064
2065 /*
2066 * Grab the queues lock to manipulate
2067 * the page queues. Change wiring
2068 * case is obvious. In soft ref bits
2069 * case activate page only if it fell
2070 * off paging queues, otherwise just
2071 * activate it if it's inactive.
2072 *
2073 * NOTE: original vm_fault code will
2074 * move active page to back of active
2075 * queue. This code doesn't.
2076 */
2077 vm_page_lock_queues();
2078
2079 if (m->clustered) {
2080 vm_pagein_cluster_used++;
2081 m->clustered = FALSE;
2082 }
2083 m->reference = TRUE;
2084
2085 if (change_wiring) {
2086 if (wired)
2087 vm_page_wire(m);
2088 else
2089 vm_page_unwire(m);
2090 }
2091 #if VM_FAULT_STATIC_CONFIG
2092 else {
2093 if (!m->active && !m->inactive)
2094 vm_page_activate(m);
2095 }
2096 #else
2097 else if (software_reference_bits) {
2098 if (!m->active && !m->inactive)
2099 vm_page_activate(m);
2100 }
2101 else if (!m->active) {
2102 vm_page_activate(m);
2103 }
2104 #endif
2105 vm_page_unlock_queues();
2106
2107 /*
2108 * That's it, clean up and return.
2109 */
2110 PAGE_WAKEUP_DONE(m);
2111 vm_object_paging_end(object);
2112
2113 {
2114 tws_hash_line_t line;
2115 task_t task;
2116
2117 task = current_task();
2118 if((map != NULL) &&
2119 (task->dynamic_working_set != 0) &&
2120 !(object->private)) {
2121 kern_return_t kr;
2122 vm_object_t base_object;
2123 vm_object_offset_t base_offset;
2124 base_object = object;
2125 base_offset = cur_offset;
2126 while(base_object->shadow) {
2127 base_offset +=
2128 base_object->shadow_offset;
2129 base_object =
2130 base_object->shadow;
2131 }
2132 kr = tws_lookup((tws_hash_t)
2133 task->dynamic_working_set,
2134 base_offset, base_object,
2135 &line);
2136 if(kr == KERN_OPERATION_TIMED_OUT){
2137 write_startup_file = 1;
2138 } else if (kr != KERN_SUCCESS) {
2139 kr = tws_insert((tws_hash_t)
2140 task->dynamic_working_set,
2141 base_offset, base_object,
2142 vaddr, pmap_map);
2143 if(kr == KERN_NO_SPACE) {
2144 vm_object_unlock(object);
2145
2146 tws_expand_working_set(
2147 task->dynamic_working_set,
2148 TWS_HASH_LINE_COUNT,
2149 FALSE);
2150
2151 vm_object_lock(object);
2152 }
2153 if(kr ==
2154 KERN_OPERATION_TIMED_OUT) {
2155 write_startup_file = 1;
2156 }
2157 }
2158 }
2159 }
2160 vm_object_unlock(object);
2161
2162 vm_map_unlock_read(map);
2163 if(pmap_map != map)
2164 vm_map_unlock(pmap_map);
2165
2166 if(write_startup_file)
2167 tws_send_startup_info(current_task());
2168
2169 if (funnel_set)
2170 thread_funnel_set( curflock, TRUE);
2171
2172 thread_interrupt_level(interruptible_state);
2173
2174
2175 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
2176 vaddr,
2177 type_of_fault & 0xff,
2178 KERN_SUCCESS,
2179 type_of_fault >> 8,
2180 0);
2181
2182 return KERN_SUCCESS;
2183 }
2184
2185 /*
2186 * Copy on write fault. If objects match, then
2187 * object->copy must not be NULL (else control
2188 * would be in previous code block), and we
2189 * have a potential push into the copy object
2190 * with which we won't cope here.
2191 */
2192
2193 if (cur_object == object)
2194 break;
2195 /*
2196 * This is now a shadow based copy on write
2197 * fault -- it requires a copy up the shadow
2198 * chain.
2199 *
2200 * Allocate a page in the original top level
2201 * object. Give up if the allocation fails. Also
2202 * need to remember current page, as it's the
2203 * source of the copy.
2204 */
2205 cur_m = m;
2206 m = vm_page_grab();
2207 if (m == VM_PAGE_NULL) {
2208 break;
2209 }
2210 /*
2211 * Now do the copy. Mark the source busy
2212 * and take out paging references on both
2213 * objects.
2214 *
2215 * NOTE: This code holds the map lock across
2216 * the page copy.
2217 */
2218
2219 cur_m->busy = TRUE;
2220 vm_page_copy(cur_m, m);
2221 vm_page_insert(m, object, offset);
2222
2223 vm_object_paging_begin(cur_object);
2224 vm_object_paging_begin(object);
2225
2226 type_of_fault = DBG_COW_FAULT;
2227 VM_STAT(cow_faults++);
2228 current_task()->cow_faults++;
2229
2230 /*
2231 * Now cope with the source page and object
2232 * If the top object has a ref count of 1
2233 * then no other map can access it, and hence
2234 * it's not necessary to do the pmap_page_protect.
2235 */
2236
2237
2238 vm_page_lock_queues();
2239 vm_page_deactivate(cur_m);
2240 m->dirty = TRUE;
2241 pmap_page_protect(cur_m->phys_addr,
2242 VM_PROT_NONE);
2243 vm_page_unlock_queues();
2244
2245 PAGE_WAKEUP_DONE(cur_m);
2246 vm_object_paging_end(cur_object);
2247 vm_object_unlock(cur_object);
2248
2249 /*
2250 * Slight hack to call vm_object collapse
2251 * and then reuse the common map-in code.
2252 * note that the object lock was taken above.
2253 */
2254
2255 vm_object_paging_end(object);
2256 vm_object_collapse(object);
2257 vm_object_paging_begin(object);
2258
2259 goto FastPmapEnter;
2260 }
2261 else {
2262
2263 /*
2264 * No page at cur_object, cur_offset
2265 */
2266
2267 if (cur_object->pager_created) {
2268
2269 /*
2270 * Have to talk to the pager. Give up.
2271 */
2272 break;
2273 }
2274
2275
2276 if (cur_object->shadow == VM_OBJECT_NULL) {
2277
2278 if (cur_object->shadow_severed) {
2279 vm_object_paging_end(object);
2280 vm_object_unlock(object);
2281 vm_map_unlock_read(map);
2282 if(pmap_map != map)
2283 vm_map_unlock(pmap_map);
2284
2285 if(write_startup_file)
2286 tws_send_startup_info(
2287 current_task());
2288
2289 if (funnel_set) {
2290 thread_funnel_set( curflock, TRUE);
2291 funnel_set = FALSE;
2292 }
2293 thread_interrupt_level(interruptible_state);
2294
2295 return VM_FAULT_MEMORY_ERROR;
2296 }
2297
2298 /*
2299 * Zero fill fault. Page gets
2300 * filled in top object. Insert
2301 * page, then drop any lower lock.
2302 * Give up if no page.
2303 */
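/*
 * The fast path gives up on zero fill once free memory has fallen
 * more than a quarter of the way from vm_page_free_target toward
 * vm_page_free_min.  With illustrative values vm_page_free_target
 * == 3500 and vm_page_free_min == 1500, the cutoff is
 * 3500 - ((3500 - 1500) >> 2) = 3000 pages; below that,
 * break out and let the slow path handle the fault.
 */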
2304 if ((vm_page_free_target -
2305 ((vm_page_free_target-vm_page_free_min)>>2))
2306 > vm_page_free_count) {
2307 break;
2308 }
2309 m = vm_page_alloc(object, offset);
2310 if (m == VM_PAGE_NULL) {
2311 break;
2312 }
2313 /*
2314 * This is a zero-fill or initial fill
2315 * page fault. As such, we consider it
2316 * undefined with respect to instruction
2317 * execution; i.e., it is the responsibility
2318 * of higher layers to call for an instruction
2319 * sync after changing the contents and before
2320 * sending a program into this area. We
2321 * choose this approach for performance.
2322 */
2323
2324 m->no_isync = FALSE;
2325
2326 if (cur_object != object)
2327 vm_object_unlock(cur_object);
2328
2329 vm_object_paging_begin(object);
2330 vm_object_unlock(object);
2331
2332 /*
2333 * Now zero fill page and map it.
2334 * the page is probably going to
2335 * be written soon, so don't bother
2336 * to clear the modified bit
2337 *
2338 * NOTE: This code holds the map
2339 * lock across the zero fill.
2340 */
2341
2342 if (!map->no_zero_fill) {
2343 vm_page_zero_fill(m);
2344 type_of_fault = DBG_ZERO_FILL_FAULT;
2345 VM_STAT(zero_fill_count++);
2346 }
2347 vm_page_lock_queues();
2348 VM_PAGE_QUEUES_REMOVE(m);
2349
2350 m->page_ticket = vm_page_ticket;
2351 if(m->object->size > 0x80000) {
2352 m->zero_fill = TRUE;
2353 /* depends on the queues lock */
2354 vm_zf_count += 1;
2355 queue_enter(&vm_page_queue_zf,
2356 m, vm_page_t, pageq);
2357 } else {
2358 queue_enter(
2359 &vm_page_queue_inactive,
2360 m, vm_page_t, pageq);
2361 }
2362 vm_page_ticket_roll++;
2363 if(vm_page_ticket_roll ==
2364 VM_PAGE_TICKETS_IN_ROLL) {
2365 vm_page_ticket_roll = 0;
2366 if(vm_page_ticket ==
2367 VM_PAGE_TICKET_ROLL_IDS)
2368 vm_page_ticket= 0;
2369 else
2370 vm_page_ticket++;
2371 }
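/*
 * Ticket bookkeeping: vm_page_ticket_roll counts insertions and,
 * after VM_PAGE_TICKETS_IN_ROLL of them, advances vm_page_ticket,
 * which wraps to zero once it reaches VM_PAGE_TICKET_ROLL_IDS.
 * With an illustrative roll size of 128, the first 128 pages
 * queued here share one ticket, the next 128 the following
 * ticket, and so on.
 */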
2372
2373 m->inactive = TRUE;
2374 vm_page_inactive_count++;
2375 vm_page_unlock_queues();
2376 vm_object_lock(object);
2377
2378 goto FastPmapEnter;
2379 }
2380
2381 /*
2382 * On to the next level
2383 */
2384
2385 cur_offset += cur_object->shadow_offset;
2386 new_object = cur_object->shadow;
2387 vm_object_lock(new_object);
2388 if (cur_object != object)
2389 vm_object_unlock(cur_object);
2390 cur_object = new_object;
2391
2392 continue;
2393 }
2394 }
2395
2396 /*
2397 * Cleanup from fast fault failure. Drop any object
2398 * lock other than original and drop map lock.
2399 */
2400
2401 if (object != cur_object)
2402 vm_object_unlock(cur_object);
2403 }
2404 vm_map_unlock_read(map);
2405
2406 if(pmap_map != map)
2407 vm_map_unlock(pmap_map);
2408
2409 /*
2410 * Make a reference to this object to
2411 * prevent its disposal while we are messing with
2412 * it. Once we have the reference, the map is free
2413 * to be diddled. Since objects reference their
2414 * shadows (and copies), they will stay around as well.
2415 */
2416
2417 assert(object->ref_count > 0);
2418 object->ref_count++;
2419 vm_object_res_reference(object);
2420 vm_object_paging_begin(object);
2421
2422 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
2423 {
2424 tws_hash_line_t line;
2425 task_t task;
2426 kern_return_t kr;
2427
2428 task = current_task();
2429 if((map != NULL) &&
2430 (task->dynamic_working_set != 0)
2431 && !(object->private)) {
2432 vm_object_t base_object;
2433 vm_object_offset_t base_offset;
2434 base_object = object;
2435 base_offset = offset;
2436 while(base_object->shadow) {
2437 base_offset +=
2438 base_object->shadow_offset;
2439 base_object =
2440 base_object->shadow;
2441 }
2442 kr = tws_lookup((tws_hash_t)
2443 task->dynamic_working_set,
2444 base_offset, base_object,
2445 &line);
2446 if(kr == KERN_OPERATION_TIMED_OUT){
2447 write_startup_file = 1;
2448 } else if (kr != KERN_SUCCESS) {
2449 tws_insert((tws_hash_t)
2450 task->dynamic_working_set,
2451 base_offset, base_object,
2452 vaddr, pmap_map);
2453 kr = tws_insert((tws_hash_t)
2454 task->dynamic_working_set,
2455 base_offset, base_object,
2456 vaddr, pmap_map);
2457 if(kr == KERN_NO_SPACE) {
2458 vm_object_unlock(object);
2459 tws_expand_working_set(
2460 task->dynamic_working_set,
2461 TWS_HASH_LINE_COUNT,
2462 FALSE);
2463 vm_object_lock(object);
2464 }
2465 if(kr == KERN_OPERATION_TIMED_OUT) {
2466 write_startup_file = 1;
2467 }
2468 }
2469 }
2470 }
2471 kr = vm_fault_page(object, offset, fault_type,
2472 (change_wiring && !wired),
2473 interruptible,
2474 lo_offset, hi_offset, behavior,
2475 &prot, &result_page, &top_page,
2476 &type_of_fault,
2477 &error_code, map->no_zero_fill, FALSE, map, vaddr);
2478
2479 /*
2480 * If we didn't succeed, lose the object reference immediately.
2481 */
2482
2483 if (kr != VM_FAULT_SUCCESS)
2484 vm_object_deallocate(object);
2485
2486 /*
2487 * See why we failed, and take corrective action.
2488 */
2489
2490 switch (kr) {
2491 case VM_FAULT_SUCCESS:
2492 break;
2493 case VM_FAULT_MEMORY_SHORTAGE:
2494 if (vm_page_wait((change_wiring) ?
2495 THREAD_UNINT :
2496 THREAD_ABORTSAFE))
2497 goto RetryFault;
2498 /* fall thru */
2499 case VM_FAULT_INTERRUPTED:
2500 kr = KERN_ABORTED;
2501 goto done;
2502 case VM_FAULT_RETRY:
2503 goto RetryFault;
2504 case VM_FAULT_FICTITIOUS_SHORTAGE:
2505 vm_page_more_fictitious();
2506 goto RetryFault;
2507 case VM_FAULT_MEMORY_ERROR:
2508 if (error_code)
2509 kr = error_code;
2510 else
2511 kr = KERN_MEMORY_ERROR;
2512 goto done;
2513 }
2514
2515 m = result_page;
2516
2517 if(m != VM_PAGE_NULL) {
2518 assert((change_wiring && !wired) ?
2519 (top_page == VM_PAGE_NULL) :
2520 ((top_page == VM_PAGE_NULL) == (m->object == object)));
2521 }
2522
2523 /*
2524 * How to clean up the result of vm_fault_page. This
2525 * happens whether the mapping is entered or not.
2526 */
2527
2528 #define UNLOCK_AND_DEALLOCATE \
2529 MACRO_BEGIN \
2530 vm_fault_cleanup(m->object, top_page); \
2531 vm_object_deallocate(object); \
2532 MACRO_END
2533
2534 /*
2535 * What to do with the resulting page from vm_fault_page
2536 * if it doesn't get entered into the physical map:
2537 */
2538
2539 #define RELEASE_PAGE(m) \
2540 MACRO_BEGIN \
2541 PAGE_WAKEUP_DONE(m); \
2542 vm_page_lock_queues(); \
2543 if (!m->active && !m->inactive) \
2544 vm_page_activate(m); \
2545 vm_page_unlock_queues(); \
2546 MACRO_END
2547
2548 /*
2549 * We must verify that the maps have not changed
2550 * since our last lookup.
2551 */
2552
2553 if(m != VM_PAGE_NULL) {
2554 old_copy_object = m->object->copy;
2555 vm_object_unlock(m->object);
2556 } else {
2557 old_copy_object = VM_OBJECT_NULL;
2558 }
2559 if ((map != original_map) || !vm_map_verify(map, &version)) {
2560 vm_object_t retry_object;
2561 vm_object_offset_t retry_offset;
2562 vm_prot_t retry_prot;
2563
2564 /*
2565 * To avoid trying to write_lock the map while another
2566 * thread has it read_locked (in vm_map_pageable), we
2567 * do not try for write permission. If the page is
2568 * still writable, we will get write permission. If it
2569 * is not, or has been marked needs_copy, we enter the
2570 * mapping without write permission, and will merely
2571 * take another fault.
2572 */
2573 map = original_map;
2574 vm_map_lock_read(map);
2575 kr = vm_map_lookup_locked(&map, vaddr,
2576 fault_type & ~VM_PROT_WRITE, &version,
2577 &retry_object, &retry_offset, &retry_prot,
2578 &wired, &behavior, &lo_offset, &hi_offset,
2579 &pmap_map);
2580 pmap = pmap_map->pmap;
2581
2582 if (kr != KERN_SUCCESS) {
2583 vm_map_unlock_read(map);
2584 if(m != VM_PAGE_NULL) {
2585 vm_object_lock(m->object);
2586 RELEASE_PAGE(m);
2587 UNLOCK_AND_DEALLOCATE;
2588 } else {
2589 vm_object_deallocate(object);
2590 }
2591 goto done;
2592 }
2593
2594 vm_object_unlock(retry_object);
2595 if(m != VM_PAGE_NULL) {
2596 vm_object_lock(m->object);
2597 } else {
2598 vm_object_lock(object);
2599 }
2600
2601 if ((retry_object != object) ||
2602 (retry_offset != offset)) {
2603 vm_map_unlock_read(map);
2604 if(pmap_map != map)
2605 vm_map_unlock(pmap_map);
2606 if(m != VM_PAGE_NULL) {
2607 RELEASE_PAGE(m);
2608 UNLOCK_AND_DEALLOCATE;
2609 } else {
2610 vm_object_deallocate(object);
2611 }
2612 goto RetryFault;
2613 }
2614
2615 /*
2616 * Check whether the protection has changed or the object
2617 * has been copied while we left the map unlocked.
2618 */
2619 prot &= retry_prot;
2620 if(m != VM_PAGE_NULL) {
2621 vm_object_unlock(m->object);
2622 } else {
2623 vm_object_unlock(object);
2624 }
2625 }
2626 if(m != VM_PAGE_NULL) {
2627 vm_object_lock(m->object);
2628 } else {
2629 vm_object_lock(object);
2630 }
2631
2632 /*
2633 * If the copy object changed while the top-level object
2634 * was unlocked, then we must take away write permission.
2635 */
2636
2637 if(m != VM_PAGE_NULL) {
2638 if (m->object->copy != old_copy_object)
2639 prot &= ~VM_PROT_WRITE;
2640 }
2641
2642 /*
2643 * If we want to wire down this page, but no longer have
2644 * adequate permissions, we must start all over.
2645 */
2646
2647 if (wired && (fault_type != (prot|VM_PROT_WRITE))) {
2648 vm_map_verify_done(map, &version);
2649 if(pmap_map != map)
2650 vm_map_unlock(pmap_map);
2651 if(m != VM_PAGE_NULL) {
2652 RELEASE_PAGE(m);
2653 UNLOCK_AND_DEALLOCATE;
2654 } else {
2655 vm_object_deallocate(object);
2656 }
2657 goto RetryFault;
2658 }
2659
2660 /*
2661 * Put this page into the physical map.
2662 * We had to do the unlock above because pmap_enter
2663 * may cause other faults. The page may be on
2664 * the pageout queues. If the pageout daemon comes
2665 * across the page, it will remove it from the queues.
2666 */
2667 if (m != VM_PAGE_NULL) {
2668 if (m->no_isync == TRUE) {
2669 pmap_sync_caches_phys(m->phys_addr);
2670
2671 m->no_isync = FALSE;
2672 }
2673
2674 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
2675
2676 if(caller_pmap) {
2677 PMAP_ENTER(caller_pmap,
2678 caller_pmap_addr, m,
2679 prot, cache_attr, wired);
2680 } else {
2681 PMAP_ENTER(pmap, vaddr, m,
2682 prot, cache_attr, wired);
2683 }
2684 {
2685 tws_hash_line_t line;
2686 task_t task;
2687 kern_return_t kr;
2688
2689 task = current_task();
2690 if((map != NULL) &&
2691 (task->dynamic_working_set != 0)
2692 && (object->private)) {
2693 vm_object_t base_object;
2694 vm_object_offset_t base_offset;
2695 base_object = m->object;
2696 base_offset = m->offset;
2697 while(base_object->shadow) {
2698 base_offset +=
2699 base_object->shadow_offset;
2700 base_object =
2701 base_object->shadow;
2702 }
2703 kr = tws_lookup((tws_hash_t)
2704 task->dynamic_working_set,
2705 base_offset, base_object, &line);
2706 if(kr == KERN_OPERATION_TIMED_OUT){
2707 write_startup_file = 1;
2708 } else if (kr != KERN_SUCCESS) {
2709 tws_insert((tws_hash_t)
2710 task->dynamic_working_set,
2711 base_offset, base_object,
2712 vaddr, pmap_map);
2713 kr = tws_insert((tws_hash_t)
2714 task->dynamic_working_set,
2715 base_offset, base_object,
2716 vaddr, pmap_map);
2717 if(kr == KERN_NO_SPACE) {
2718 vm_object_unlock(m->object);
2719 tws_expand_working_set(
2720 task->dynamic_working_set,
2721 TWS_HASH_LINE_COUNT,
2722 FALSE);
2723 vm_object_lock(m->object);
2724 }
2725 if(kr == KERN_OPERATION_TIMED_OUT) {
2726 write_startup_file = 1;
2727 }
2728 }
2729 }
2730 }
2731 } else {
2732
2733 #ifndef i386
2734 int memattr;
2735 struct phys_entry *pp;
2736 vm_map_entry_t entry;
2737 vm_offset_t laddr;
2738 vm_offset_t ldelta, hdelta;
2739
2740 /*
2741 * do a pmap block mapping from the physical address
2742 * in the object
2743 */
2744 if(pp = pmap_find_physentry(
2745 (vm_offset_t)object->shadow_offset)) {
2746 memattr = ((pp->pte1 & 0x00000078) >> 3);
2747 } else {
2748 memattr = VM_WIMG_MASK & (int)object->wimg_bits;
2749 }
2750
2751
2752 /* While we do not worry about execution protection in */
2753 /* general, we may be able to read device memory and */
2754 /* still not be able to execute it. Here we check for */
2755 /* the guarded bit. If it's set and we are attempting */
2756 /* to execute, we return with a protection failure. */
2757
2758 if((memattr & VM_MEM_GUARDED) &&
2759 (full_fault_type & VM_PROT_EXECUTE)) {
2760 vm_map_verify_done(map, &version);
2761 if(pmap_map != map)
2762 vm_map_unlock(pmap_map);
2763 vm_fault_cleanup(object, top_page);
2764 vm_object_deallocate(object);
2765 kr = KERN_PROTECTION_FAILURE;
2766 goto done;
2767 }
2768
2769
2770
2771 if(pmap_map != map) {
2772 vm_map_unlock(pmap_map);
2773 }
2774 if (original_map != map) {
2775 vm_map_unlock_read(map);
2776 vm_map_lock_read(original_map);
2777 map = original_map;
2778 }
2779 pmap_map = map;
2780
2781 laddr = vaddr;
2782 hdelta = 0xFFFFF000;
2783 ldelta = 0xFFFFF000;
2784
2785
2786 while(vm_map_lookup_entry(map, laddr, &entry)) {
2787 if(ldelta > (laddr - entry->vme_start))
2788 ldelta = laddr - entry->vme_start;
2789 if(hdelta > (entry->vme_end - laddr))
2790 hdelta = entry->vme_end - laddr;
2791 if(entry->is_sub_map) {
2792
2793 laddr = (laddr - entry->vme_start)
2794 + entry->offset;
2795 vm_map_lock_read(entry->object.sub_map);
2796 if(map != pmap_map)
2797 vm_map_unlock_read(map);
2798 if(entry->use_pmap) {
2799 vm_map_unlock_read(pmap_map);
2800 pmap_map = entry->object.sub_map;
2801 }
2802 map = entry->object.sub_map;
2803
2804 } else {
2805 break;
2806 }
2807 }
2808
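/*
 * At this point ldelta/hdelta bound the largest block around vaddr
 * that stays inside every map entry visited.  For a single entry
 * spanning [0x10000, 0x20000) with vaddr == 0x13000 (illustrative
 * addresses), ldelta ends up 0x3000 and hdelta 0xD000, so the
 * block mapping below covers the whole 0x10000-byte entry.
 */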
2809 if(vm_map_lookup_entry(map, laddr, &entry) &&
2810 (entry->object.vm_object != NULL) &&
2811 (entry->object.vm_object == object)) {
2812
2813
2814 if(caller_pmap) {
2815 pmap_map_block(caller_pmap,
2816 caller_pmap_addr - ldelta,
2817 ((vm_offset_t)
2818 (entry->object.vm_object->shadow_offset))
2819 + entry->offset +
2820 (laddr - entry->vme_start) - ldelta,
2821 ldelta + hdelta, prot,
2822 memattr, 0); /* Set up a block mapped area */
2823 } else {
2824 pmap_map_block(pmap_map->pmap, vaddr - ldelta,
2825 ((vm_offset_t)
2826 (entry->object.vm_object->shadow_offset))
2827 + entry->offset +
2828 (laddr - entry->vme_start) - ldelta,
2829 ldelta + hdelta, prot,
2830 memattr, 0); /* Set up a block mapped area */
2831 }
2832 }
2833 #else
2834 #ifdef notyet
2835 if(caller_pmap) {
2836 pmap_enter(caller_pmap, caller_pmap_addr,
2837 object->shadow_offset, prot, 0, TRUE);
2838 } else {
2839 pmap_enter(pmap, vaddr,
2840 object->shadow_offset, prot, 0, TRUE);
2841 }
2842 /* Map it in */
2843 #endif
2844 #endif
2845
2846 }
2847
2848 /*
2849 * If the page is not wired down and isn't already
2850 * on a pageout queue, then put it where the
2851 * pageout daemon can find it.
2852 */
2853 if(m != VM_PAGE_NULL) {
2854 vm_page_lock_queues();
2855
2856 if (change_wiring) {
2857 if (wired)
2858 vm_page_wire(m);
2859 else
2860 vm_page_unwire(m);
2861 }
2862 #if VM_FAULT_STATIC_CONFIG
2863 else {
2864 if (!m->active && !m->inactive)
2865 vm_page_activate(m);
2866 m->reference = TRUE;
2867 }
2868 #else
2869 else if (software_reference_bits) {
2870 if (!m->active && !m->inactive)
2871 vm_page_activate(m);
2872 m->reference = TRUE;
2873 } else {
2874 vm_page_activate(m);
2875 }
2876 #endif
2877 vm_page_unlock_queues();
2878 }
2879
2880 /*
2881 * Unlock everything, and return
2882 */
2883
2884 vm_map_verify_done(map, &version);
2885 if(pmap_map != map)
2886 vm_map_unlock(pmap_map);
2887 if(m != VM_PAGE_NULL) {
2888 PAGE_WAKEUP_DONE(m);
2889 UNLOCK_AND_DEALLOCATE;
2890 } else {
2891 vm_fault_cleanup(object, top_page);
2892 vm_object_deallocate(object);
2893 }
2894 kr = KERN_SUCCESS;
2895
2896 #undef UNLOCK_AND_DEALLOCATE
2897 #undef RELEASE_PAGE
2898
2899 done:
2900 if(write_startup_file)
2901 tws_send_startup_info(current_task());
2902 if (funnel_set) {
2903 thread_funnel_set( curflock, TRUE);
2904 funnel_set = FALSE;
2905 }
2906 thread_interrupt_level(interruptible_state);
2907
2908 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
2909 vaddr,
2910 type_of_fault & 0xff,
2911 kr,
2912 type_of_fault >> 8,
2913 0);
2914
2915 return(kr);
2916 }
2917
2918 /*
2919 * vm_fault_wire:
2920 *
2921 * Wire down a range of virtual addresses in a map.
2922 */
2923 kern_return_t
2924 vm_fault_wire(
2925 vm_map_t map,
2926 vm_map_entry_t entry,
2927 pmap_t pmap,
2928 vm_offset_t pmap_addr)
2929 {
2930
2931 register vm_offset_t va;
2932 register vm_offset_t end_addr = entry->vme_end;
2933 register kern_return_t rc;
2934
2935 assert(entry->in_transition);
2936
2937 if ((entry->object.vm_object != NULL) &&
2938 !entry->is_sub_map &&
2939 entry->object.vm_object->phys_contiguous) {
2940 return KERN_SUCCESS;
2941 }
2942
2943 /*
2944 * Inform the physical mapping system that the
2945 * range of addresses may not fault, so that
2946 * page tables and such can be locked down as well.
2947 */
2948
2949 pmap_pageable(pmap, pmap_addr,
2950 pmap_addr + (end_addr - entry->vme_start), FALSE);
2951
2952 /*
2953 * We simulate a fault to get the page and enter it
2954 * in the physical map.
2955 */
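/*
 * Each page in the entry is wired one at a time: try the stripped
 * down vm_fault_wire_fast() first and fall back to the full
 * vm_fault() with change_wiring == TRUE.  If any page fails, the
 * pages wired so far are backed out through vm_fault_unwire() on
 * a temporary entry clipped at the failing address.
 */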
2956
2957 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
2958 if ((rc = vm_fault_wire_fast(
2959 map, va, entry, pmap,
2960 pmap_addr + (va - entry->vme_start)
2961 )) != KERN_SUCCESS) {
2962 rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
2963 (pmap == kernel_pmap) ?
2964 THREAD_UNINT : THREAD_ABORTSAFE,
2965 pmap, pmap_addr + (va - entry->vme_start));
2966 }
2967
2968 if (rc != KERN_SUCCESS) {
2969 struct vm_map_entry tmp_entry = *entry;
2970
2971 /* unwire wired pages */
2972 tmp_entry.vme_end = va;
2973 vm_fault_unwire(map,
2974 &tmp_entry, FALSE, pmap, pmap_addr);
2975
2976 return rc;
2977 }
2978 }
2979 return KERN_SUCCESS;
2980 }
2981
2982 /*
2983 * vm_fault_unwire:
2984 *
2985 * Unwire a range of virtual addresses in a map.
2986 */
2987 void
2988 vm_fault_unwire(
2989 vm_map_t map,
2990 vm_map_entry_t entry,
2991 boolean_t deallocate,
2992 pmap_t pmap,
2993 vm_offset_t pmap_addr)
2994 {
2995 register vm_offset_t va;
2996 register vm_offset_t end_addr = entry->vme_end;
2997 vm_object_t object;
2998
2999 object = (entry->is_sub_map)
3000 ? VM_OBJECT_NULL : entry->object.vm_object;
3001
3002 /*
3003 * Since the pages are wired down, we must be able to
3004 * get their mappings from the physical map system.
3005 */
3006
3007 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3008 pmap_change_wiring(pmap,
3009 pmap_addr + (va - entry->vme_start), FALSE);
3010
3011 if (object == VM_OBJECT_NULL) {
3012 (void) vm_fault(map, va, VM_PROT_NONE,
3013 TRUE, THREAD_UNINT, pmap, pmap_addr);
3014 } else if (object->phys_contiguous) {
3015 continue;
3016 } else {
3017 vm_prot_t prot;
3018 vm_page_t result_page;
3019 vm_page_t top_page;
3020 vm_object_t result_object;
3021 vm_fault_return_t result;
3022
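/*
 * vm_fault_page() is used here with VM_PROT_NONE simply to chase
 * down the resident page backing this wired address (possibly
 * through the shadow chain) so it can be freed or unwired below;
 * no new mapping is created.
 */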
3023 do {
3024 prot = VM_PROT_NONE;
3025
3026 vm_object_lock(object);
3027 vm_object_paging_begin(object);
3028 XPR(XPR_VM_FAULT,
3029 "vm_fault_unwire -> vm_fault_page\n",
3030 0,0,0,0,0);
3031 result = vm_fault_page(object,
3032 entry->offset +
3033 (va - entry->vme_start),
3034 VM_PROT_NONE, TRUE,
3035 THREAD_UNINT,
3036 entry->offset,
3037 entry->offset +
3038 (entry->vme_end
3039 - entry->vme_start),
3040 entry->behavior,
3041 &prot,
3042 &result_page,
3043 &top_page,
3044 (int *)0,
3045 0, map->no_zero_fill,
3046 FALSE, NULL, 0);
3047 } while (result == VM_FAULT_RETRY);
3048
3049 if (result != VM_FAULT_SUCCESS)
3050 panic("vm_fault_unwire: failure");
3051
3052 result_object = result_page->object;
3053 if (deallocate) {
3054 assert(!result_page->fictitious);
3055 pmap_page_protect(result_page->phys_addr,
3056 VM_PROT_NONE);
3057 VM_PAGE_FREE(result_page);
3058 } else {
3059 vm_page_lock_queues();
3060 vm_page_unwire(result_page);
3061 vm_page_unlock_queues();
3062 PAGE_WAKEUP_DONE(result_page);
3063 }
3064
3065 vm_fault_cleanup(result_object, top_page);
3066 }
3067 }
3068
3069 /*
3070 * Inform the physical mapping system that the range
3071 * of addresses may fault, so that page tables and
3072 * such may be unwired themselves.
3073 */
3074
3075 pmap_pageable(pmap, pmap_addr,
3076 pmap_addr + (end_addr - entry->vme_start), TRUE);
3077
3078 }
3079
3080 /*
3081 * vm_fault_wire_fast:
3082 *
3083 * Handle common case of a wire down page fault at the given address.
3084 * If successful, the page is inserted into the associated physical map.
3085 * The map entry is passed in to avoid the overhead of a map lookup.
3086 *
3087 * NOTE: the given address should be truncated to the
3088 * proper page address.
3089 *
3090 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
3091 * a standard error specifying why the fault is fatal is returned.
3092 *
3093 * The map in question must be referenced, and remains so.
3094 * Caller has a read lock on the map.
3095 *
3096 * This is a stripped version of vm_fault() for wiring pages. Anything
3097 * other than the common case will return KERN_FAILURE, and the caller
3098 * is expected to call vm_fault().
3099 */
3100 kern_return_t
3101 vm_fault_wire_fast(
3102 vm_map_t map,
3103 vm_offset_t va,
3104 vm_map_entry_t entry,
3105 pmap_t pmap,
3106 vm_offset_t pmap_addr)
3107 {
3108 vm_object_t object;
3109 vm_object_offset_t offset;
3110 register vm_page_t m;
3111 vm_prot_t prot;
3112 thread_act_t thr_act;
3113 unsigned int cache_attr;
3114
3115 VM_STAT(faults++);
3116
3117 if((thr_act=current_act()) && (thr_act->task != TASK_NULL))
3118 thr_act->task->faults++;
3119
3120 /*
3121 * Recovery actions
3122 */
3123
3124 #undef RELEASE_PAGE
3125 #define RELEASE_PAGE(m) { \
3126 PAGE_WAKEUP_DONE(m); \
3127 vm_page_lock_queues(); \
3128 vm_page_unwire(m); \
3129 vm_page_unlock_queues(); \
3130 }
3131
3132
3133 #undef UNLOCK_THINGS
3134 #define UNLOCK_THINGS { \
3135 object->paging_in_progress--; \
3136 vm_object_unlock(object); \
3137 }
3138
3139 #undef UNLOCK_AND_DEALLOCATE
3140 #define UNLOCK_AND_DEALLOCATE { \
3141 UNLOCK_THINGS; \
3142 vm_object_deallocate(object); \
3143 }
3144 /*
3145 * Give up and have caller do things the hard way.
3146 */
3147
3148 #define GIVE_UP { \
3149 UNLOCK_AND_DEALLOCATE; \
3150 return(KERN_FAILURE); \
3151 }
3152
3153
3154 /*
3155 * If this entry is not directly to a vm_object, bail out.
3156 */
3157 if (entry->is_sub_map)
3158 return(KERN_FAILURE);
3159
3160 /*
3161 * Find the backing store object and offset into it.
3162 */
3163
3164 object = entry->object.vm_object;
3165 offset = (va - entry->vme_start) + entry->offset;
3166 prot = entry->protection;
3167
3168 /*
3169 * Make a reference to this object to prevent its
3170 * disposal while we are messing with it.
3171 */
3172
3173 vm_object_lock(object);
3174 assert(object->ref_count > 0);
3175 object->ref_count++;
3176 vm_object_res_reference(object);
3177 object->paging_in_progress++;
3178
3179 /*
3180 * INVARIANTS (through entire routine):
3181 *
3182 * 1) At all times, we must either have the object
3183 * lock or a busy page in some object to prevent
3184 * some other thread from trying to bring in
3185 * the same page.
3186 *
3187 * 2) Once we have a busy page, we must remove it from
3188 * the pageout queues, so that the pageout daemon
3189 * will not grab it away.
3190 *
3191 */
3192
3193 /*
3194 * Look for page in top-level object. If it's not there or
3195 * there's something going on, give up.
3196 */
3197 m = vm_page_lookup(object, offset);
3198 if ((m == VM_PAGE_NULL) || (m->busy) ||
3199 (m->unusual && ( m->error || m->restart || m->absent ||
3200 prot & m->page_lock))) {
3201
3202 GIVE_UP;
3203 }
3204
3205 /*
3206 * Wire the page down now. All bail outs beyond this
3207 * point must unwire the page.
3208 */
3209
3210 vm_page_lock_queues();
3211 vm_page_wire(m);
3212 vm_page_unlock_queues();
3213
3214 /*
3215 * Mark page busy for other threads.
3216 */
3217 assert(!m->busy);
3218 m->busy = TRUE;
3219 assert(!m->absent);
3220
3221 /*
3222 * Give up if the page is being written and there's a copy object
3223 */
3224 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
3225 RELEASE_PAGE(m);
3226 GIVE_UP;
3227 }
3228
3229 /*
3230 * Put this page into the physical map.
3231 * We have to unlock the object because pmap_enter
3232 * may cause other faults.
3233 */
3234 if (m->no_isync == TRUE) {
3235 pmap_sync_caches_phys(m->phys_addr);
3236
3237 m->no_isync = FALSE;
3238 }
3239
3240 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3241
3242 PMAP_ENTER(pmap, pmap_addr, m, prot, cache_attr, TRUE);
3243
3244 /*
3245 * Unlock everything, and return
3246 */
3247
3248 PAGE_WAKEUP_DONE(m);
3249 UNLOCK_AND_DEALLOCATE;
3250
3251 return(KERN_SUCCESS);
3252
3253 }
3254
3255 /*
3256 * Routine: vm_fault_copy_cleanup
3257 * Purpose:
3258 * Release a page used by vm_fault_copy.
3259 */
3260
3261 void
3262 vm_fault_copy_cleanup(
3263 vm_page_t page,
3264 vm_page_t top_page)
3265 {
3266 vm_object_t object = page->object;
3267
3268 vm_object_lock(object);
3269 PAGE_WAKEUP_DONE(page);
3270 vm_page_lock_queues();
3271 if (!page->active && !page->inactive)
3272 vm_page_activate(page);
3273 vm_page_unlock_queues();
3274 vm_fault_cleanup(object, top_page);
3275 }
3276
3277 void
3278 vm_fault_copy_dst_cleanup(
3279 vm_page_t page)
3280 {
3281 vm_object_t object;
3282
3283 if (page != VM_PAGE_NULL) {
3284 object = page->object;
3285 vm_object_lock(object);
3286 vm_page_lock_queues();
3287 vm_page_unwire(page);
3288 vm_page_unlock_queues();
3289 vm_object_paging_end(object);
3290 vm_object_unlock(object);
3291 }
3292 }
3293
3294 /*
3295 * Routine: vm_fault_copy
3296 *
3297 * Purpose:
3298 * Copy pages from one virtual memory object to another --
3299 * neither the source nor destination pages need be resident.
3300 *
3301 * Before actually copying a page, the version associated with
3302 * the destination address map will be verified.
3303 *
3304 * In/out conditions:
3305 * The caller must hold a reference, but not a lock, to
3306 * each of the source and destination objects and to the
3307 * destination map.
3308 *
3309 * Results:
3310 * Returns KERN_SUCCESS if no errors were encountered in
3311 * reading or writing the data. Returns KERN_INTERRUPTED if
3312 * the operation was interrupted (only possible if the
3313 * "interruptible" argument is asserted). Other return values
3314 * indicate a permanent error in copying the data.
3315 *
3316 * The actual amount of data copied will be returned in the
3317 * "copy_size" argument. In the event that the destination map
3318 * verification failed, this amount may be less than the amount
3319 * requested.
3320 */
3321 kern_return_t
3322 vm_fault_copy(
3323 vm_object_t src_object,
3324 vm_object_offset_t src_offset,
3325 vm_size_t *src_size, /* INOUT */
3326 vm_object_t dst_object,
3327 vm_object_offset_t dst_offset,
3328 vm_map_t dst_map,
3329 vm_map_version_t *dst_version,
3330 int interruptible)
3331 {
3332 vm_page_t result_page;
3333
3334 vm_page_t src_page;
3335 vm_page_t src_top_page;
3336 vm_prot_t src_prot;
3337
3338 vm_page_t dst_page;
3339 vm_page_t dst_top_page;
3340 vm_prot_t dst_prot;
3341
3342 vm_size_t amount_left;
3343 vm_object_t old_copy_object;
3344 kern_return_t error = 0;
3345
3346 vm_size_t part_size;
3347
3348 /*
3349 * In order not to confuse the clustered pageins, align
3350 * the different offsets on a page boundary.
3351 */
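/*
 * Illustrative arithmetic, assuming 4K pages: src_offset == 0x1A40
 * and *src_size == 0x2000 give src_lo_offset == 0x1000 and
 * src_hi_offset == round_page_64(0x3A40) == 0x4000, so clustered
 * pageins operate on whole-page bounds around the copy.
 */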
3352 vm_object_offset_t src_lo_offset = trunc_page_64(src_offset);
3353 vm_object_offset_t dst_lo_offset = trunc_page_64(dst_offset);
3354 vm_object_offset_t src_hi_offset = round_page_64(src_offset + *src_size);
3355 vm_object_offset_t dst_hi_offset = round_page_64(dst_offset + *src_size);
3356
3357 #define RETURN(x) \
3358 MACRO_BEGIN \
3359 *src_size -= amount_left; \
3360 MACRO_RETURN(x); \
3361 MACRO_END
3362
3363 amount_left = *src_size;
3364 do { /* while (amount_left > 0) */
3365 /*
3366 * There may be a deadlock if both source and destination
3367 * pages are the same. To avoid this deadlock, the copy must
3368 * start by getting the destination page in order to apply
3369 * COW semantics if any.
3370 */
3371
3372 RetryDestinationFault: ;
3373
3374 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
3375
3376 vm_object_lock(dst_object);
3377 vm_object_paging_begin(dst_object);
3378
3379 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
3380 switch (vm_fault_page(dst_object,
3381 trunc_page_64(dst_offset),
3382 VM_PROT_WRITE|VM_PROT_READ,
3383 FALSE,
3384 interruptible,
3385 dst_lo_offset,
3386 dst_hi_offset,
3387 VM_BEHAVIOR_SEQUENTIAL,
3388 &dst_prot,
3389 &dst_page,
3390 &dst_top_page,
3391 (int *)0,
3392 &error,
3393 dst_map->no_zero_fill,
3394 FALSE, NULL, 0)) {
3395 case VM_FAULT_SUCCESS:
3396 break;
3397 case VM_FAULT_RETRY:
3398 goto RetryDestinationFault;
3399 case VM_FAULT_MEMORY_SHORTAGE:
3400 if (vm_page_wait(interruptible))
3401 goto RetryDestinationFault;
3402 /* fall thru */
3403 case VM_FAULT_INTERRUPTED:
3404 RETURN(MACH_SEND_INTERRUPTED);
3405 case VM_FAULT_FICTITIOUS_SHORTAGE:
3406 vm_page_more_fictitious();
3407 goto RetryDestinationFault;
3408 case VM_FAULT_MEMORY_ERROR:
3409 if (error)
3410 return (error);
3411 else
3412 return(KERN_MEMORY_ERROR);
3413 }
3414 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
3415
3416 old_copy_object = dst_page->object->copy;
3417
3418 /*
3419 * There exists the possibility that the source and
3420 * destination page are the same. But we can't
3421 * easily determine that now. If they are the
3422 * same, the call to vm_fault_page() for the
3423 * destination page will deadlock. To prevent this we
3424 * wire the page so we can drop busy without having
3425 * the page daemon steal the page. We clean up the
3426 * top page but keep the paging reference on the object
3427 * holding the dest page so it doesn't go away.
3428 */
3429
3430 vm_page_lock_queues();
3431 vm_page_wire(dst_page);
3432 vm_page_unlock_queues();
3433 PAGE_WAKEUP_DONE(dst_page);
3434 vm_object_unlock(dst_page->object);
3435
3436 if (dst_top_page != VM_PAGE_NULL) {
3437 vm_object_lock(dst_object);
3438 VM_PAGE_FREE(dst_top_page);
3439 vm_object_paging_end(dst_object);
3440 vm_object_unlock(dst_object);
3441 }
3442
3443 RetrySourceFault: ;
3444
3445 if (src_object == VM_OBJECT_NULL) {
3446 /*
3447 * No source object. We will just
3448 * zero-fill the page in dst_object.
3449 */
3450 src_page = VM_PAGE_NULL;
3451 result_page = VM_PAGE_NULL;
3452 } else {
3453 vm_object_lock(src_object);
3454 src_page = vm_page_lookup(src_object,
3455 trunc_page_64(src_offset));
3456 if (src_page == dst_page) {
3457 src_prot = dst_prot;
3458 result_page = VM_PAGE_NULL;
3459 } else {
3460 src_prot = VM_PROT_READ;
3461 vm_object_paging_begin(src_object);
3462
3463 XPR(XPR_VM_FAULT,
3464 "vm_fault_copy(2) -> vm_fault_page\n",
3465 0,0,0,0,0);
3466 switch (vm_fault_page(src_object,
3467 trunc_page_64(src_offset),
3468 VM_PROT_READ,
3469 FALSE,
3470 interruptible,
3471 src_lo_offset,
3472 src_hi_offset,
3473 VM_BEHAVIOR_SEQUENTIAL,
3474 &src_prot,
3475 &result_page,
3476 &src_top_page,
3477 (int *)0,
3478 &error,
3479 FALSE,
3480 FALSE, NULL, 0)) {
3481
3482 case VM_FAULT_SUCCESS:
3483 break;
3484 case VM_FAULT_RETRY:
3485 goto RetrySourceFault;
3486 case VM_FAULT_MEMORY_SHORTAGE:
3487 if (vm_page_wait(interruptible))
3488 goto RetrySourceFault;
3489 /* fall thru */
3490 case VM_FAULT_INTERRUPTED:
3491 vm_fault_copy_dst_cleanup(dst_page);
3492 RETURN(MACH_SEND_INTERRUPTED);
3493 case VM_FAULT_FICTITIOUS_SHORTAGE:
3494 vm_page_more_fictitious();
3495 goto RetrySourceFault;
3496 case VM_FAULT_MEMORY_ERROR:
3497 vm_fault_copy_dst_cleanup(dst_page);
3498 if (error)
3499 return (error);
3500 else
3501 return(KERN_MEMORY_ERROR);
3502 }
3503
3504
3505 assert((src_top_page == VM_PAGE_NULL) ==
3506 (result_page->object == src_object));
3507 }
3508 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
3509 vm_object_unlock(result_page->object);
3510 }
3511
3512 if (!vm_map_verify(dst_map, dst_version)) {
3513 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3514 vm_fault_copy_cleanup(result_page, src_top_page);
3515 vm_fault_copy_dst_cleanup(dst_page);
3516 break;
3517 }
3518
3519 vm_object_lock(dst_page->object);
3520
3521 if (dst_page->object->copy != old_copy_object) {
3522 vm_object_unlock(dst_page->object);
3523 vm_map_verify_done(dst_map, dst_version);
3524 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3525 vm_fault_copy_cleanup(result_page, src_top_page);
3526 vm_fault_copy_dst_cleanup(dst_page);
3527 break;
3528 }
3529 vm_object_unlock(dst_page->object);
3530
3531 /*
3532 * Copy the page, and note that it is dirty
3533 * immediately.
3534 */
3535
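/*
 * Partial-page example (4K pages, illustrative offsets): with
 * src_offset == 0x1A40 and dst_offset == 0x2300, src_po == 0xA40
 * and dst_po == 0x300; since src_po is the larger of the two,
 * part_size == PAGE_SIZE - src_po == 0x5C0 bytes, further clamped
 * to amount_left.
 */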
3536 if (!page_aligned(src_offset) ||
3537 !page_aligned(dst_offset) ||
3538 !page_aligned(amount_left)) {
3539
3540 vm_object_offset_t src_po,
3541 dst_po;
3542
3543 src_po = src_offset - trunc_page_64(src_offset);
3544 dst_po = dst_offset - trunc_page_64(dst_offset);
3545
3546 if (dst_po > src_po) {
3547 part_size = PAGE_SIZE - dst_po;
3548 } else {
3549 part_size = PAGE_SIZE - src_po;
3550 }
3551 if (part_size > (amount_left)){
3552 part_size = amount_left;
3553 }
3554
3555 if (result_page == VM_PAGE_NULL) {
3556 vm_page_part_zero_fill(dst_page,
3557 dst_po, part_size);
3558 } else {
3559 vm_page_part_copy(result_page, src_po,
3560 dst_page, dst_po, part_size);
3561 if(!dst_page->dirty){
3562 vm_object_lock(dst_object);
3563 dst_page->dirty = TRUE;
3564 vm_object_unlock(dst_page->object);
3565 }
3566
3567 }
3568 } else {
3569 part_size = PAGE_SIZE;
3570
3571 if (result_page == VM_PAGE_NULL)
3572 vm_page_zero_fill(dst_page);
3573 else{
3574 vm_page_copy(result_page, dst_page);
3575 if(!dst_page->dirty){
3576 vm_object_lock(dst_object);
3577 dst_page->dirty = TRUE;
3578 vm_object_unlock(dst_page->object);
3579 }
3580 }
3581
3582 }
3583
3584 /*
3585 * Unlock everything, and return
3586 */
3587
3588 vm_map_verify_done(dst_map, dst_version);
3589
3590 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3591 vm_fault_copy_cleanup(result_page, src_top_page);
3592 vm_fault_copy_dst_cleanup(dst_page);
3593
3594 amount_left -= part_size;
3595 src_offset += part_size;
3596 dst_offset += part_size;
3597 } while (amount_left > 0);
3598
3599 RETURN(KERN_SUCCESS);
3600 #undef RETURN
3601
3602 /*NOTREACHED*/
3603 }
3604
3605 #ifdef notdef
3606
3607 /*
3608 * Routine: vm_fault_page_overwrite
3609 *
3610 * Description:
3611 * A form of vm_fault_page that assumes that the
3612 * resulting page will be overwritten in its entirety,
3613 * making it unnecessary to obtain the correct *contents*
3614 * of the page.
3615 *
3616 * Implementation:
3617 * XXX Untested. Also unused. Eventually, this technology
3618 * could be used in vm_fault_copy() to advantage.
3619 */
3620 vm_fault_return_t
3621 vm_fault_page_overwrite(
3622 register
3623 vm_object_t dst_object,
3624 vm_object_offset_t dst_offset,
3625 vm_page_t *result_page) /* OUT */
3626 {
3627 register
3628 vm_page_t dst_page;
3629 kern_return_t wait_result;
3630
3631 #define interruptible THREAD_UNINT /* XXX */
3632
3633 while (TRUE) {
3634 /*
3635 * Look for a page at this offset
3636 */
3637
3638 while ((dst_page = vm_page_lookup(dst_object, dst_offset))
3639 == VM_PAGE_NULL) {
3640 /*
3641 * No page, no problem... just allocate one.
3642 */
3643
3644 dst_page = vm_page_alloc(dst_object, dst_offset);
3645 if (dst_page == VM_PAGE_NULL) {
3646 vm_object_unlock(dst_object);
3647 VM_PAGE_WAIT();
3648 vm_object_lock(dst_object);
3649 continue;
3650 }
3651
3652 /*
3653 * Pretend that the memory manager
3654 * write-protected the page.
3655 *
3656 * Note that we will be asking for write
3657 * permission without asking for the data
3658 * first.
3659 */
3660
3661 dst_page->overwriting = TRUE;
3662 dst_page->page_lock = VM_PROT_WRITE;
3663 dst_page->absent = TRUE;
3664 dst_page->unusual = TRUE;
3665 dst_object->absent_count++;
3666
3667 break;
3668
3669 /*
3670 * When we bail out, we might have to throw
3671 * away the page created here.
3672 */
3673
3674 #define DISCARD_PAGE \
3675 MACRO_BEGIN \
3676 vm_object_lock(dst_object); \
3677 dst_page = vm_page_lookup(dst_object, dst_offset); \
3678 if ((dst_page != VM_PAGE_NULL) && dst_page->overwriting) \
3679 VM_PAGE_FREE(dst_page); \
3680 vm_object_unlock(dst_object); \
3681 MACRO_END
3682 }
3683
3684 /*
3685 * If the page is write-protected...
3686 */
3687
3688 if (dst_page->page_lock & VM_PROT_WRITE) {
3689 /*
3690 * ... and an unlock request hasn't been sent
3691 */
3692
3693 if ( ! (dst_page->unlock_request & VM_PROT_WRITE)) {
3694 vm_prot_t u;
3695 kern_return_t rc;
3696
3697 /*
3698 * ... then send one now.
3699 */
3700
3701 if (!dst_object->pager_ready) {
3702 wait_result = vm_object_assert_wait(dst_object,
3703 VM_OBJECT_EVENT_PAGER_READY,
3704 interruptible);
3705 vm_object_unlock(dst_object);
3706 if (wait_result == THREAD_WAITING)
3707 wait_result = thread_block(THREAD_CONTINUE_NULL);
3708 if (wait_result != THREAD_AWAKENED) {
3709 DISCARD_PAGE;
3710 return(VM_FAULT_INTERRUPTED);
3711 }
3712 continue;
3713 }
3714
3715 u = dst_page->unlock_request |= VM_PROT_WRITE;
3716 vm_object_unlock(dst_object);
3717
3718 if ((rc = memory_object_data_unlock(
3719 dst_object->pager,
3720 dst_offset + dst_object->paging_offset,
3721 PAGE_SIZE,
3722 u)) != KERN_SUCCESS) {
3723 if (vm_fault_debug)
3724 printf("vm_object_overwrite: memory_object_data_unlock failed\n");
3725 DISCARD_PAGE;
3726 return((rc == MACH_SEND_INTERRUPTED) ?
3727 VM_FAULT_INTERRUPTED :
3728 VM_FAULT_MEMORY_ERROR);
3729 }
3730 vm_object_lock(dst_object);
3731 continue;
3732 }
3733
3734 /* ... fall through to wait below */
3735 } else {
3736 /*
3737 * If the page isn't being used for other
3738 * purposes, then we're done.
3739 */
3740 if ( ! (dst_page->busy || dst_page->absent ||
3741 dst_page->error || dst_page->restart) )
3742 break;
3743 }
3744
3745 wait_result = PAGE_ASSERT_WAIT(dst_page, interruptible);
3746 vm_object_unlock(dst_object);
3747 if (wait_result == THREAD_WAITING)
3748 wait_result = thread_block(THREAD_CONTINUE_NULL);
3749 if (wait_result != THREAD_AWAKENED) {
3750 DISCARD_PAGE;
3751 return(VM_FAULT_INTERRUPTED);
3752 }
3753 }
3754
3755 *result_page = dst_page;
3756 return(VM_FAULT_SUCCESS);
3757
3758 #undef interruptible
3759 #undef DISCARD_PAGE
3760 }
3761
3762 #endif /* notdef */
3763
3764 #if VM_FAULT_CLASSIFY
3765 /*
3766 * Temporary statistics gathering support.
3767 */
3768
3769 /*
3770 * Statistics arrays:
3771 */
3772 #define VM_FAULT_TYPES_MAX 5
3773 #define VM_FAULT_LEVEL_MAX 8
3774
3775 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
3776
3777 #define VM_FAULT_TYPE_ZERO_FILL 0
3778 #define VM_FAULT_TYPE_MAP_IN 1
3779 #define VM_FAULT_TYPE_PAGER 2
3780 #define VM_FAULT_TYPE_COPY 3
3781 #define VM_FAULT_TYPE_OTHER 4
3782
3783
3784 void
3785 vm_fault_classify(vm_object_t object,
3786 vm_object_offset_t offset,
3787 vm_prot_t fault_type)
3788 {
3789 int type, level = 0;
3790 vm_page_t m;
3791
3792 while (TRUE) {
3793 m = vm_page_lookup(object, offset);
3794 if (m != VM_PAGE_NULL) {
3795 if (m->busy || m->error || m->restart || m->absent ||
3796 fault_type & m->page_lock) {
3797 type = VM_FAULT_TYPE_OTHER;
3798 break;
3799 }
3800 if (((fault_type & VM_PROT_WRITE) == 0) ||
3801 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
3802 type = VM_FAULT_TYPE_MAP_IN;
3803 break;
3804 }
3805 type = VM_FAULT_TYPE_COPY;
3806 break;
3807 }
3808 else {
3809 if (object->pager_created) {
3810 type = VM_FAULT_TYPE_PAGER;
3811 break;
3812 }
3813 if (object->shadow == VM_OBJECT_NULL) {
3814 type = VM_FAULT_TYPE_ZERO_FILL;
3815 break;
3816 }
3817
3818 offset += object->shadow_offset;
3819 object = object->shadow;
3820 level++;
3821 continue;
3822 }
3823 }
3824
3825 if (level >= VM_FAULT_LEVEL_MAX)
3826 level = VM_FAULT_LEVEL_MAX - 1;
3827
3828 vm_fault_stats[type][level] += 1;
3829
3830 return;
3831 }
3832
3833 /* cleanup routine to call from debugger */
3834
3835 void
3836 vm_fault_classify_init(void)
3837 {
3838 int type, level;
3839
3840 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
3841 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
3842 vm_fault_stats[type][level] = 0;
3843 }
3844 }
3845
3846 return;
3847 }
3848 #endif /* VM_FAULT_CLASSIFY */