osfmk/vm/vm_fault.c

   1 /*
   2  * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_HEADER_START@
   5  *
   6  * The contents of this file constitute Original Code as defined in and
   7  * are subject to the Apple Public Source License Version 1.1 (the
   8  * "License").  You may not use this file except in compliance with the
   9  * License.  Please obtain a copy of the License at
  10  * http://www.apple.com/publicsource and read it before using this file.
  11  *
  12  * This Original Code and all software distributed under the License are
  13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
  17  * License for the specific language governing rights and limitations
  18  * under the License.
  19  *
  20  * @APPLE_LICENSE_HEADER_END@
  21  */
  22 /*
  23  * @OSF_COPYRIGHT@
  24  */
  25 /*
  26  * Mach Operating System
  27  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  28  * All Rights Reserved.
  29  *
  30  * Permission to use, copy, modify and distribute this software and its
  31  * documentation is hereby granted, provided that both the copyright
  32  * notice and this permission notice appear in all copies of the
  33  * software, derivative works or modified versions, and any portions
  34  * thereof, and that both notices appear in supporting documentation.
  35  *
  36  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  37  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  38  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  39  *
  40  * Carnegie Mellon requests users of this software to return to
  41  *
  42  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  43  *  School of Computer Science
  44  *  Carnegie Mellon University
  45  *  Pittsburgh PA 15213-3890
  46  *
  47  * any improvements or extensions that they make and grant Carnegie Mellon
  48  * the rights to redistribute these changes.
  49  */
  50 /*
  51  */
  52 /*
  53  *      File:   vm_fault.c
  54  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  55  *
  56  *      Page fault handling module.
  57  */
  58 #ifdef MACH_BSD
  59 /* MACH_BSD builds only: hand-written externs; remove after component interface available */
  60 extern int      vnode_pager_workaround;         /* declaration only; no definition in the visible code */
  61 extern int      device_pager_workaround;        /* declaration only; no definition in the visible code */
  62 #endif  /* MACH_BSD */
  63
  64 #include <mach_cluster_stats.h>
  65 #include <mach_pagemap.h>
  66 #include <mach_kdb.h>
  67
  68 #include <vm/vm_fault.h>
  69 #include <mach/kern_return.h>
  70 #include <mach/message.h>       /* for error codes */
  71 #include <kern/host_statistics.h>
  72 #include <kern/counters.h>
  73 #include <kern/task.h>
  74 #include <kern/thread.h>
  75 #include <kern/sched_prim.h>
  76 #include <kern/host.h>
  77 #include <kern/xpr.h>
  78 #include <ppc/proc_reg.h>
  79 #include <ppc/pmap_internals.h>
  80 #include <vm/task_working_set.h>
  81 #include <vm/vm_map.h>
  82 #include <vm/vm_object.h>
  83 #include <vm/vm_page.h>
  84 #include <vm/pmap.h>
  85 #include <vm/vm_pageout.h>
  86 #include <mach/vm_param.h>
  87 #include <mach/vm_behavior.h>
  88 #include <mach/memory_object.h>
  89                                 /* For memory_object_data_{request,unlock} */
  90 #include <kern/mach_param.h>
  91 #include <kern/macro_help.h>
  92 #include <kern/zalloc.h>
  93 #include <kern/misc_protos.h>
  94
  95 #include <sys/kdebug.h>
  96
  97 #define VM_FAULT_CLASSIFY       0       /* nonzero: declare the vm_fault_classify*() routines below */
  98 #define VM_FAULT_STATIC_CONFIG  1       /* nonzero: compile out the run-time tunables declared below */
  99
 100 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) nonzero: compile in the dbgTrace() calls in vm_fault_page */
 101
 102 int             vm_object_absent_max = 50;      /* presumably a cap on absent pages per object -- TODO confirm; unused in the visible code */
 103
 104 int             vm_fault_debug = 0;             /* NOTE(review): no reader in the visible code -- confirm use */
 105 boolean_t       vm_page_deactivate_behind = TRUE;       /* NOTE(review): no reader in the visible code -- confirm use */
 106
 107
 108 #if     !VM_FAULT_STATIC_CONFIG      /* tunables exist only when the configuration is not static */
 109 boolean_t       vm_fault_dirty_handling = FALSE;        /* TRUE: vm_fault_page withholds VM_PROT_WRITE on non-write faults */
 110 boolean_t       vm_fault_interruptible = FALSE;         /* FALSE: vm_fault_page forces interruptible to THREAD_UNINT */
 111 boolean_t       software_reference_bits = TRUE;         /* NOTE(review): no reader in the visible code -- confirm use */
 112 #endif  /* !VM_FAULT_STATIC_CONFIG */
 113
 114 #if     MACH_KDB
 115 extern struct db_watchpoint *db_watchpoint_list;        /* non-null: vm_fault_page withholds VM_PROT_WRITE on non-write faults */
 116 #endif  /* MACH_KDB */
 117
 118 /* Forward declarations of internal routines (prototypes only; no bodies precede vm_fault_page). */
 119 extern kern_return_t vm_fault_wire_fast(
 120                                 vm_map_t        map,
 121                                 vm_offset_t     va,
 122                                 vm_map_entry_t  entry,
 123                                 pmap_t          pmap);
 124
 125 extern void vm_fault_continue(void);
 126
 127 extern void vm_fault_copy_cleanup(
 128                                 vm_page_t       page,
 129                                 vm_page_t       top_page);
 130
 131 extern void vm_fault_copy_dst_cleanup(
 132                                 vm_page_t       page);
 133
 134 #if     VM_FAULT_CLASSIFY    /* defined as 0 above, so these two are not declared by default */
 135 extern void vm_fault_classify(vm_object_t       object,
 136                           vm_object_offset_t    offset,
 137                           vm_prot_t             fault_type);
 138
 139 extern void vm_fault_classify_init(void);
 140 #endif  /* VM_FAULT_CLASSIFY */
 141
 142 /*
 143  *      Routine:        vm_fault_init
 144  *      Purpose:
 145  *              Initialization hook for this module; takes and returns nothing.
 146  */
 147 void
 148 vm_fault_init(void)
 149 {       /* empty body: nothing is initialized here */
 150 }
 151
 152 /*
 153  *      Routine:        vm_fault_cleanup
 154  *      Purpose:
 155  *              Clean up the result of vm_fault_page.
 156  *      Results:
 157  *              The paging reference for "object" is released and
 158  *              "object" is then unlocked.
 159  *              If "top_page" is not VM_PAGE_NULL, the object that owns it
 160  *              (top_page->object) is locked, "top_page" is freed, and that
 161  *              object's paging reference is released before it is unlocked.
 162  *
 163  *      In/out conditions:
 164  *              "object" must be locked.
 165  */
 166 void
 167 vm_fault_cleanup(
 168         register vm_object_t    object,         /* locked on entry; unlocked on return */
 169         register vm_page_t      top_page)       /* page to free, or VM_PAGE_NULL for none */
 170 {
 171         vm_object_paging_end(object);           /* drop the paging reference while still locked */
 172         vm_object_unlock(object);
 173
 174         if (top_page != VM_PAGE_NULL) {
 175             object = top_page->object;          /* "object" is reused for the page's owning object */
 176             vm_object_lock(object);
 177             VM_PAGE_FREE(top_page);             /* freed with the owning object locked */
 178             vm_object_paging_end(object);
 179             vm_object_unlock(object);
 180         }
 181 }
 182
 183 #if     MACH_CLUSTER_STATS
 184 #define MAXCLUSTERPAGES 16      /* number of entries in cluster_stats_in[] */
 185 struct {
 186         unsigned long pages_in_cluster;                 /* bumped by CLUSTER_STAT_CLUSTER(x) */
 187         unsigned long pages_at_higher_offsets;          /* bumped by CLUSTER_STAT_HIGHER(x) */
 188         unsigned long pages_at_lower_offsets;           /* bumped by CLUSTER_STAT_LOWER(x) */
 189 } cluster_stats_in[MAXCLUSTERPAGES];
 190 #define CLUSTER_STAT(clause)    clause  /* stats enabled: the clause is compiled in as written */
 191 #define CLUSTER_STAT_HIGHER(x)  \
 192         ((cluster_stats_in[(x)].pages_at_higher_offsets)++)     /* x indexes the array with no range check */
 193 #define CLUSTER_STAT_LOWER(x)   \
 194          ((cluster_stats_in[(x)].pages_at_lower_offsets)++)     /* x indexes the array with no range check */
 195 #define CLUSTER_STAT_CLUSTER(x) \
 196         ((cluster_stats_in[(x)].pages_in_cluster)++)            /* x indexes the array with no range check */
 197 #else   /* MACH_CLUSTER_STATS */
 198 #define CLUSTER_STAT(clause)    /* stats disabled: clause is discarded; the CLUSTER_STAT_*() counters are not defined in this branch */
 199 #endif  /* MACH_CLUSTER_STATS */
 200
 201 /* XXX - temporary */
 202 boolean_t vm_allow_clustered_pagein = FALSE;    /* NOTE(review): no reader in the visible code -- confirm use */
 203 int vm_pagein_cluster_used = 0;                 /* incremented in vm_fault_page when a looked-up page has "clustered" set */
 204
 205 /*
 206  * Prepage default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 207  */
 208 int vm_default_ahead = 1;       /* Number of pages to prepage ahead */
 209 int vm_default_behind = 0;      /* Number of pages to prepage behind */
 210
 211 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)    /* TRUE when x has no bits of (PAGE_SIZE_64 - 1) set, i.e. page aligned if PAGE_SIZE_64 is a power of two */
 212
 213 /*
 214  *      Routine:        vm_fault_page
 215  *      Purpose:
 216  *              Find the resident page for the virtual memory
 217  *              specified by the given virtual memory object
 218  *              and offset.
 219  *      Additional arguments:
 220  *              The required permissions for the page are given
 221  *              in "fault_type".  Desired permissions are included
 222  *              in "protection".  The minimum and maximum valid offsets
 223  *              within the object for the relevant map entry are
 224  *              passed in "lo_offset" and "hi_offset" respectively and
 225  *              the expected page reference pattern is passed in "behavior".
 226  *              These three parameters are used to determine pagein cluster
 227  *              limits.
 228  *
 229  *              If the desired page is known to be resident (for
 230  *              example, because it was previously wired down), asserting
 231  *              the "must_be_resident" parameter will speed the search.
 232  *
 233  *              If the operation can be interrupted (by thread_abort
 234  *              or thread_terminate), then the "interruptible"
 235  *              parameter should be asserted.
 236  *
 237  *      Results:
 238  *              The page containing the proper data is returned
 239  *              in "result_page".
 240  *
 241  *      In/out conditions:
 242  *              The source object must be locked and referenced,
 243  *              and must donate one paging reference.  The reference
 244  *              is not affected.  The paging reference and lock are
 245  *              consumed.
 246  *
 247  *              If the call succeeds, the object in which "result_page"
 248  *              resides is left locked and holding a paging reference.
 249  *              If this is not the original object, a busy page in the
 250  *              original object is returned in "top_page", to prevent other
 251  *              callers from pursuing this same data, along with a paging
 252  *              reference for the original object.  The "top_page" should
 253  *              be destroyed when this guarantee is no longer required.
 254  *              The "result_page" is also left busy.  It is not removed
 255  *              from the pageout queues.
 256  */
 257
 258 vm_fault_return_t
 259 vm_fault_page(
 260         /* Arguments: */
 261         vm_object_t     first_object,   /* Object to begin search */
 262         vm_object_offset_t first_offset,        /* Offset into object */
 263         vm_prot_t       fault_type,     /* What access is requested */
 264         boolean_t       must_be_resident,/* Must page be resident? */
 265         int             interruptible,  /* how may fault be interrupted? */
 266         vm_object_offset_t lo_offset,   /* Map entry start */
 267         vm_object_offset_t hi_offset,   /* Map entry end */
 268         vm_behavior_t   behavior,       /* Page reference behavior */
 269         /* Modifies in place: */
 270         vm_prot_t       *protection,    /* Protection for mapping */
 271         /* Returns: */
 272         vm_page_t       *result_page,   /* Page found, if successful */
 273         vm_page_t       *top_page,      /* Page in top object, if
 274                                          * not result_page.  */
 275         int             *type_of_fault, /* if non-null, fill in with type of fault
 276                                          * COW, zero-fill, etc... returned in trace point */
 277         /* More arguments: */
 278         kern_return_t   *error_code,    /* code if page is in error */
 279         boolean_t       no_zero_fill,   /* don't zero fill absent pages */
 280         boolean_t       data_supply,    /* treat as data_supply if
 281                                          * it is a write fault and a full
 282                                          * page is provided */
 283         vm_map_t        map,
 284         vm_offset_t     vaddr)
 285 {
 286         register
 287         vm_page_t               m;
 288         register
 289         vm_object_t             object;
 290         register
 291         vm_object_offset_t      offset;
 292         vm_page_t               first_m;
 293         vm_object_t             next_object;
 294         vm_object_t             copy_object;
 295         boolean_t               look_for_page;
 296         vm_prot_t               access_required = fault_type;
 297         vm_prot_t               wants_copy_flag;
 298         vm_size_t               cluster_size, length;
 299         vm_object_offset_t      cluster_offset;
 300         vm_object_offset_t      cluster_start, cluster_end, paging_offset;
 301         vm_object_offset_t      align_offset;
 302         CLUSTER_STAT(int pages_at_higher_offsets;)
 303         CLUSTER_STAT(int pages_at_lower_offsets;)
 304         kern_return_t   wait_result;
 305         thread_t                cur_thread;
 306         boolean_t               interruptible_state;
 307         boolean_t               bumped_pagein = FALSE;
 308
 309
 310 #if     MACH_PAGEMAP
 311 /*
 312  * MACH page map - an optional optimization where a bit map is maintained
 313  * by the VM subsystem for internal objects to indicate which pages of
 314  * the object currently reside on backing store.  This existence map
 315  * duplicates information maintained by the vnode pager.  It is
 316  * created at the time of the first pageout against the object, i.e.
 317  * at the same time the pager for the object is created.  The optimization
 318  * is designed to eliminate pager interaction overhead, if it is
 319  * 'known' that the page does not exist on backing store.
 320  *
 321  * LOOK_FOR() evaluates to TRUE if the page specified by object/offset is
 322  * either marked as paged out in the existence map for the object or no
 323  * existence map exists for the object.  LOOK_FOR() is one of the
 324  * criteria in the decision to invoke the pager.   It is also used as one
 325  * of the criteria to terminate the scan for adjacent pages in a clustered
 326  * pagein operation.  Note that LOOK_FOR() always evaluates to TRUE for
 327  * permanent objects.  Note also that if the pager for an internal object
 328  * has not been created, the pager is not invoked regardless of the value
 329  * of LOOK_FOR() and that clustered pagein scans are only done on an object
 330  * for which a pager has been created.
 331  *
 332  * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 333  * is marked as paged out in the existence map for the object.
 334  * PAGED_OUT() is used to determine if a page has already been pushed
 335  * into a copy object in order to avoid a redundant page out operation.
 336  */
 337 #define LOOK_FOR(o, f) (vm_external_state_get((o)->existence_map, (f)) \
 338                         != VM_EXTERNAL_STATE_ABSENT)
 339 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
 340                         == VM_EXTERNAL_STATE_EXISTS)
 341 #else /* MACH_PAGEMAP */
 342 /*
 343  * If the MACH page map optimization is not enabled,
 344  * LOOK_FOR() always evaluates to TRUE.  The pager will always be
 345  * invoked to resolve missing pages in an object, assuming the pager
 346  * has been created for the object.  In a clustered page operation, the
 347  * absence of a page on backing store cannot be used to terminate
 348  * a scan for adjacent pages since that information is available only in
 349  * the pager.  Hence pages that may not be paged out are potentially
 350  * included in a clustered request.  The vnode pager is coded to deal
 351  * with any combination of absent/present pages in a clustered
 352  * pagein request.  PAGED_OUT() always evaluates to FALSE, i.e. the pager
 353  * will always be invoked to push a dirty page into a copy object assuming
 354  * a pager has been created.  If the page has already been pushed, the
 355  * pager will ignore the new request.
 356  */
 357 #define LOOK_FOR(o, f) TRUE
 358 #define PAGED_OUT(o, f) FALSE
 359 #endif /* MACH_PAGEMAP */
 360
 361 /*
 362  *      Recovery actions
 363  */
 364 #define PREPARE_RELEASE_PAGE(m)                         \
 365         MACRO_BEGIN                                     \
 366         vm_page_lock_queues();                          \
 367         MACRO_END
 368
 369 #define DO_RELEASE_PAGE(m)                              \
 370         MACRO_BEGIN                                     \
 371         PAGE_WAKEUP_DONE(m);                            \
 372         if (!m->active && !m->inactive)                 \
 373                 vm_page_activate(m);                    \
 374         vm_page_unlock_queues();                        \
 375         MACRO_END
 376
 377 #define RELEASE_PAGE(m)                                 \
 378         MACRO_BEGIN                                     \
 379         PREPARE_RELEASE_PAGE(m);                        \
 380         DO_RELEASE_PAGE(m);                             \
 381         MACRO_END
 382
 383 #if TRACEFAULTPAGE
 384         dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
 385 #endif
 386
 387
 388
 389 #if     !VM_FAULT_STATIC_CONFIG
 390         if (vm_fault_dirty_handling
 391 #if     MACH_KDB
 392                 /*
 393                  *      If there are watchpoints set, then
 394                  *      we don't want to give away write permission
 395                  *      on a read fault.  Make the task write fault,
 396                  *      so that the watchpoint code notices the access.
 397                  */
 398             || db_watchpoint_list
 399 #endif  /* MACH_KDB */
 400             ) {
 401                 /*
 402                  *      If we aren't asking for write permission,
 403                  *      then don't give it away.  We're using write
 404                  *      faults to set the dirty bit.
 405                  */
 406                 if (!(fault_type & VM_PROT_WRITE))
 407                         *protection &= ~VM_PROT_WRITE;
 408         }
 409
 410         if (!vm_fault_interruptible)
 411                 interruptible = THREAD_UNINT;
 412 #else   /* STATIC_CONFIG */
 413 #if     MACH_KDB
 414                 /*
 415                  *      If there are watchpoints set, then
 416                  *      we don't want to give away write permission
 417                  *      on a read fault.  Make the task write fault,
 418                  *      so that the watchpoint code notices the access.
 419                  */
 420             if (db_watchpoint_list) {
 421                 /*
 422                  *      If we aren't asking for write permission,
 423                  *      then don't give it away.  We're using write
 424                  *      faults to set the dirty bit.
 425                  */
 426                 if (!(fault_type & VM_PROT_WRITE))
 427                         *protection &= ~VM_PROT_WRITE;
 428         }
 429
 430 #endif  /* MACH_KDB */
 431 #endif  /* STATIC_CONFIG */
 432
 433         cur_thread = current_thread();
 434
 435         interruptible_state = cur_thread->interruptible;
 436         if (interruptible == THREAD_UNINT)
 437                 cur_thread->interruptible = FALSE;
 438
 439         /*
 440          *      INVARIANTS (through entire routine):
 441          *
 442          *      1)      At all times, we must either have the object
 443          *              lock or a busy page in some object to prevent
 444          *              some other thread from trying to bring in
 445          *              the same page.
 446          *
 447          *              Note that we cannot hold any locks during the
 448          *              pager access or when waiting for memory, so
 449          *              we use a busy page then.
 450          *
 451          *              Note also that we aren't as concerned about more than
 452          *              one thread attempting to memory_object_data_unlock
 453          *              the same page at once, so we don't hold the page
 454          *              as busy then, but do record the highest unlock
 455          *              value so far.  [Unlock requests may also be delivered
 456          *              out of order.]
 457          *
 458          *      2)      To prevent another thread from racing us down the
 459          *              shadow chain and entering a new page in the top
 460          *              object before we do, we must keep a busy page in
 461          *              the top object while following the shadow chain.
 462          *
 463          *      3)      We must increment paging_in_progress on any object
 464  *              for which we have a busy page.
 465          *
 466          *      4)      We leave busy pages on the pageout queues.
 467          *              If the pageout daemon comes across a busy page,
 468          *              it will remove the page from the pageout queues.
 469          */
 470
 471         /*
 472          *      Search for the page at object/offset.
 473          */
 474
 475         object = first_object;
 476         offset = first_offset;
 477         first_m = VM_PAGE_NULL;
 478         access_required = fault_type;
 479
 480         XPR(XPR_VM_FAULT,
 481                 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
 482                 (integer_t)object, offset, fault_type, *protection, 0);
 483
 484         /*
 485          *      See whether this page is resident
 486          */
 487
 488         while (TRUE) {
 489 #if TRACEFAULTPAGE
 490                 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
 491 #endif
 492                 if (!object->alive) {
 493                         vm_fault_cleanup(object, first_m);
 494                         cur_thread->interruptible = interruptible_state;
 495                         return(VM_FAULT_MEMORY_ERROR);
 496                 }
 497                 m = vm_page_lookup(object, offset);
 498 #if TRACEFAULTPAGE
 499                 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
 500 #endif
 501                 if (m != VM_PAGE_NULL) {
 502                         /*
 503                          *      If the page was pre-paged as part of a
 504                          *      cluster, record the fact.
 505                          */
 506                         if (m->clustered) {
 507                                 vm_pagein_cluster_used++;
 508                                 m->clustered = FALSE;
 509                         }
 510
 511                         /*
 512                          *      If the page is being brought in,
 513                          *      wait for it and then retry.
 514                          *
 515                          *      A possible optimization: if the page
 516                          *      is known to be resident, we can ignore
 517                          *      pages that are absent (regardless of
 518                          *      whether they're busy).
 519                          */
 520
 521                         if (m->busy) {
 522 #if TRACEFAULTPAGE
 523                                 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
 524 #endif
 525                                 PAGE_ASSERT_WAIT(m, interruptible);
 526                                 vm_object_unlock(object);
 527                                 XPR(XPR_VM_FAULT,
 528                                     "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
 529                                         (integer_t)object, offset,
 530                                         (integer_t)m, 0, 0);
 531                                 counter(c_vm_fault_page_block_busy_kernel++);
 532                                 wait_result = thread_block((void (*)(void))0);
 533
 534                                 vm_object_lock(object);
 535                                 if (wait_result != THREAD_AWAKENED) {
 536                                         vm_fault_cleanup(object, first_m);
 537                                         cur_thread->interruptible = interruptible_state;
 538                                         if (wait_result == THREAD_RESTART)
 539                                           {
 540                                                 return(VM_FAULT_RETRY);
 541                                           }
 542                                         else
 543                                           {
 544                                                 return(VM_FAULT_INTERRUPTED);
 545                                           }
 546                                 }
 547                                 continue;
 548                         }
 549
 550                         /*
 551                          *      If the page is in error, give up now.
 552                          */
 553
 554                         if (m->error) {
 555 #if TRACEFAULTPAGE
 556                                 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);      /* (TEST/DEBUG) */
 557 #endif
 558                                 if (error_code)
 559                                         *error_code = m->page_error;
 560                                 VM_PAGE_FREE(m);
 561                                 vm_fault_cleanup(object, first_m);
 562                                 cur_thread->interruptible = interruptible_state;
 563                                 return(VM_FAULT_MEMORY_ERROR);
 564                         }
 565
 566                         /*
 567                          *      If the pager wants us to restart
 568                          *      at the top of the chain,
 569                          *      typically because it has moved the
 570                          *      page to another pager, then do so.
 571                          */
 572
 573                         if (m->restart) {
 574 #if TRACEFAULTPAGE
 575                                 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
 576 #endif
 577                                 VM_PAGE_FREE(m);
 578                                 vm_fault_cleanup(object, first_m);
 579                                 cur_thread->interruptible = interruptible_state;
 580                                 return(VM_FAULT_RETRY);
 581                         }
 582
 583                         /*
 584                          *      If the page isn't busy, but is absent,
 585                          *      then it was deemed "unavailable".
 586                          */
 587
 588                         if (m->absent) {
 589                                 /*
 590                                  * Remove the non-existent page (unless it's
 591                                  * in the top object) and move on down to the
 592                                  * next object (if there is one).
 593                                  */
 594 #if TRACEFAULTPAGE
 595                                 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);  /* (TEST/DEBUG) */
 596 #endif
 597
 598                                 next_object = object->shadow;
 599                                 if (next_object == VM_OBJECT_NULL) {
 600                                         vm_page_t real_m;
 601
 602                                         assert(!must_be_resident);
 603
 604                                         if (object->shadow_severed) {
 605                                                 vm_fault_cleanup(
 606                                                         object, first_m);
 607                                                 cur_thread->interruptible = interruptible_state;
 608                                                 return VM_FAULT_MEMORY_ERROR;
 609                                         }
 610
 611                                         /*
 612                                          * Absent page at bottom of shadow
 613                                          * chain; zero fill the page we left
 614                                          * busy in the first object, and flush
 615                                          * the absent page.  But first we
 616                                          * need to allocate a real page.
 617                                          */
 618                                         if (VM_PAGE_THROTTLED() ||
 619                                             (real_m = vm_page_grab()) == VM_PAGE_NULL) {
 620                                                 vm_fault_cleanup(object, first_m);
 621                                                 cur_thread->interruptible = interruptible_state;
 622                                                 return(VM_FAULT_MEMORY_SHORTAGE);
 623                                         }
 624
 625                                         XPR(XPR_VM_FAULT,
 626               "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
 627                                                 (integer_t)object, offset,
 628                                                 (integer_t)m,
 629                                                 (integer_t)first_object, 0);
 630                                         if (object != first_object) {
 631                                                 VM_PAGE_FREE(m);
 632                                                 vm_object_paging_end(object);
 633                                                 vm_object_unlock(object);
 634                                                 object = first_object;
 635                                                 offset = first_offset;
 636                                                 m = first_m;
 637                                                 first_m = VM_PAGE_NULL;
 638                                                 vm_object_lock(object);
 639                                         }
 640
 641                                         VM_PAGE_FREE(m);
 642                                         assert(real_m->busy);
 643                                         vm_page_insert(real_m, object, offset);
 644                                         m = real_m;
 645
 646                                         /*
 647                                          *  Drop the lock while zero filling
 648                                          *  page.  Then break because this
 649                                          *  is the page we wanted.  Checking
 650                                          *  the page lock is a waste of time;
 651                                          *  this page was either absent or
 652                                          *  newly allocated -- in both cases
 653                                          *  it can't be page locked by a pager.
 654                                          */
 655                                         m->no_isync = FALSE;
 656
 657                                         if (!no_zero_fill) {
 658                                                 vm_object_unlock(object);
 659                                                 vm_page_zero_fill(m);
 660                                                 if (type_of_fault)
 661                                                         *type_of_fault = DBG_ZERO_FILL_FAULT;
 662                                                 VM_STAT(zero_fill_count++);
 663
 664                                                 if (bumped_pagein == TRUE) {
 665                                                         VM_STAT(pageins--);
 666                                                         current_task()->pageins--;
 667                                                 }
 668                                                 vm_object_lock(object);
 669                                         }
 670                                         pmap_clear_modify(m->phys_addr);
 671                                         vm_page_lock_queues();
 672                                         VM_PAGE_QUEUES_REMOVE(m);
 673                                         m->page_ticket = vm_page_ticket;
 674                                         vm_page_ticket_roll++;
 675                                         if(vm_page_ticket_roll ==
 676                                                 VM_PAGE_TICKETS_IN_ROLL) {
 677                                                 vm_page_ticket_roll = 0;
 678                                                 if(vm_page_ticket ==
 679                                                      VM_PAGE_TICKET_ROLL_IDS)
 680                                                         vm_page_ticket= 0;
 681                                                 else
 682                                                         vm_page_ticket++;
 683                                         }
 684                                         queue_enter(&vm_page_queue_inactive,
 685                                                         m, vm_page_t, pageq);
 686                                         m->inactive = TRUE;
 687                                         vm_page_inactive_count++;
 688                                         vm_page_unlock_queues();
 689                                         break;
 690                                 } else {
 691                                         if (must_be_resident) {
 692                                                 vm_object_paging_end(object);
 693                                         } else if (object != first_object) {
 694                                                 vm_object_paging_end(object);
 695                                                 VM_PAGE_FREE(m);
 696                                         } else {
 697                                                 first_m = m;
 698                                                 m->absent = FALSE;
 699                                                 m->unusual = FALSE;
 700                                                 vm_object_absent_release(object);
 701                                                 m->busy = TRUE;
 702
 703                                                 vm_page_lock_queues();
 704                                                 VM_PAGE_QUEUES_REMOVE(m);
 705                                                 vm_page_unlock_queues();
 706                                         }
 707                                         XPR(XPR_VM_FAULT,
 708                                             "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
 709                                                 (integer_t)object, offset,
 710                                                 (integer_t)next_object,
 711                                                 offset+object->shadow_offset,0);
 712                                         offset += object->shadow_offset;
 713                                         hi_offset += object->shadow_offset;
 714                                         lo_offset += object->shadow_offset;
 715                                         access_required = VM_PROT_READ;
 716                                         vm_object_lock(next_object);
 717                                         vm_object_unlock(object);
 718                                         object = next_object;
 719                                         vm_object_paging_begin(object);
 720                                         continue;
 721                                 }
 722                         }
 723
 724                         if ((m->cleaning)
 725                                 && ((object != first_object) ||
 726                                     (object->copy != VM_OBJECT_NULL))
 727                                 && (fault_type & VM_PROT_WRITE)) {
 728                                 /*
 729                                  * This is a copy-on-write fault that will
 730                                  * cause us to revoke access to this page, but
 731                                  * this page is in the process of being cleaned
 732                                  * in a clustered pageout. We must wait until
 733                                  * the cleaning operation completes before
 734                                  * revoking access to the original page,
 735                                  * otherwise we might attempt to remove a
 736                                  * wired mapping.
 737                                  */
 738 #if TRACEFAULTPAGE
 739                                 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
 740 #endif
 741                                 XPR(XPR_VM_FAULT,
 742                                     "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
 743                                         (integer_t)object, offset,
 744                                         (integer_t)m, 0, 0);
 745                                 /* take an extra ref so that object won't die */
 746                                 assert(object->ref_count > 0);
 747                                 object->ref_count++;
 748                                 vm_object_res_reference(object);
 749                                 vm_fault_cleanup(object, first_m);
 750                                 counter(c_vm_fault_page_block_backoff_kernel++);
 751                                 vm_object_lock(object);
 752                                 assert(object->ref_count > 0);
 753                                 m = vm_page_lookup(object, offset);
 754                                 if (m != VM_PAGE_NULL && m->cleaning) {
 755                                         PAGE_ASSERT_WAIT(m, interruptible);
 756                                         vm_object_unlock(object);
 757                                         wait_result = thread_block((void (*)(void)) 0);
 758                                         vm_object_deallocate(object);
 759                                         goto backoff;
 760                                 } else {
 761                                         vm_object_unlock(object);
 762                                         vm_object_deallocate(object);
 763                                         cur_thread->interruptible = interruptible_state;
 764                                         return VM_FAULT_RETRY;
 765                                 }
 766                         }
 767
 768                         /*
 769                          *      If the desired access to this page has
 770                          *      been locked out, request that it be unlocked.
 771                          */
 772
 773                         if (access_required & m->page_lock) {
 774                                 if ((access_required & m->unlock_request) != access_required) {
 775                                         vm_prot_t       new_unlock_request;
 776                                         kern_return_t   rc;
 777
 778 #if TRACEFAULTPAGE
 779                                         dbgTrace(0xBEEF000A, (unsigned int) m, (unsigned int) object->pager_ready);     /* (TEST/DEBUG) */
 780 #endif
 781                                         if (!object->pager_ready) {
 782                                         XPR(XPR_VM_FAULT,
 783                                             "vm_f_page: ready wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
 784                                                 access_required,
 785                                                 (integer_t)object, offset,
 786                                                 (integer_t)m, 0);
 787                                                 /* take an extra ref */
 788                                                 assert(object->ref_count > 0);
 789                                                 object->ref_count++;
 790                                                 vm_object_res_reference(object);
 791                                                 vm_fault_cleanup(object,
 792                                                                  first_m);
 793                                                 counter(c_vm_fault_page_block_backoff_kernel++);
 794                                                 vm_object_lock(object);
 795                                                 assert(object->ref_count > 0);
 796                                                 if (!object->pager_ready) {
 797                                                         vm_object_assert_wait(
 798                                                                 object,
 799                                                                 VM_OBJECT_EVENT_PAGER_READY,
 800                                                                 interruptible);
 801                                                         vm_object_unlock(object);
 802                                                         wait_result = thread_block((void (*)(void))0);
 803                                                         vm_object_deallocate(object);
 804                                                         goto backoff;
 805                                                 } else {
 806                                                         vm_object_unlock(object);
 807                                                         vm_object_deallocate(object);
 808                                                         cur_thread->interruptible = interruptible_state;
 809                                                         return VM_FAULT_RETRY;
 810                                                 }
 811                                         }
 812
 813                                         new_unlock_request = m->unlock_request =
 814                                                 (access_required | m->unlock_request);
 815                                         vm_object_unlock(object);
 816                                         XPR(XPR_VM_FAULT,
 817                                             "vm_f_page: unlock obj 0x%X, offset 0x%X, page 0x%X, unl_req %d\n",
 818                                         (integer_t)object, offset,
 819                                         (integer_t)m, new_unlock_request, 0);
 820                                         if ((rc = memory_object_data_unlock(
 821                                                 object->pager,
 822                                                 offset + object->paging_offset,
 823                                                 PAGE_SIZE,
 824                                                 new_unlock_request))
 825                                              != KERN_SUCCESS) {
 826                                                 if (vm_fault_debug)
 827                                                     printf("vm_fault: memory_object_data_unlock failed\n");
 828                                                 vm_object_lock(object);
 829                                                 vm_fault_cleanup(object, first_m);
 830                                                 cur_thread->interruptible = interruptible_state;
 831                                                 return((rc == MACH_SEND_INTERRUPTED) ?
 832                                                         VM_FAULT_INTERRUPTED :
 833                                                         VM_FAULT_MEMORY_ERROR);
 834                                         }
 835                                         vm_object_lock(object);
 836                                         continue;
 837                                 }
 838
 839                                 XPR(XPR_VM_FAULT,
 840         "vm_f_page: access wait acc_req %d, obj 0x%X, offset 0x%X, page 0x%X\n",
 841                                         access_required, (integer_t)object,
 842                                         offset, (integer_t)m, 0);
 843                                 /* take an extra ref so object won't die */
 844                                 assert(object->ref_count > 0);
 845                                 object->ref_count++;
 846                                 vm_object_res_reference(object);
 847                                 vm_fault_cleanup(object, first_m);
 848                                 counter(c_vm_fault_page_block_backoff_kernel++);
 849                                 vm_object_lock(object);
 850                                 assert(object->ref_count > 0);
 851                                 m = vm_page_lookup(object, offset);
 852                                 if (m != VM_PAGE_NULL &&
 853                                     (access_required & m->page_lock) &&
 854                                     !((access_required & m->unlock_request) != access_required)) {
 855                                         PAGE_ASSERT_WAIT(m, interruptible);
 856                                         vm_object_unlock(object);
 857                                         wait_result = thread_block((void (*)(void)) 0);
 858                                         vm_object_deallocate(object);
 859                                         goto backoff;
 860                                 } else {
 861                                         vm_object_unlock(object);
 862                                         vm_object_deallocate(object);
 863                                         cur_thread->interruptible = interruptible_state;
 864                                         return VM_FAULT_RETRY;
 865                                 }
 866                         }
 867                         /*
 868                          *      We mark the page busy and leave it on
 869                          *      the pageout queues.  If the pageout
 870                          *      daemon comes across it, then it will
 871                          *      remove the page.
 872                          */
 873
 874 #if TRACEFAULTPAGE
 875                         dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
 876 #endif
 877
 878 #if     !VM_FAULT_STATIC_CONFIG
 879                         if (!software_reference_bits) {
 880                                 vm_page_lock_queues();
 881                                 if (m->inactive)
 882                                         vm_stat.reactivations++;
 883
 884                                 VM_PAGE_QUEUES_REMOVE(m);
 885                                 vm_page_unlock_queues();
 886                         }
 887 #endif
 888                         XPR(XPR_VM_FAULT,
 889                             "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
 890                                 (integer_t)object, offset, (integer_t)m, 0, 0);
 891                         assert(!m->busy);
 892                         m->busy = TRUE;
 893                         assert(!m->absent);
 894                         break;
 895                 }
 896
 897                 look_for_page =
 898                         (object->pager_created) &&
 899                           LOOK_FOR(object, offset) &&
 900                             (!data_supply);
 901
 902 #if TRACEFAULTPAGE
 903                 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);      /* (TEST/DEBUG) */
 904 #endif
 905                 if ((look_for_page || (object == first_object))
 906                                 && !must_be_resident
 907                                 && !(object->phys_contiguous))  {
 908                         /*
 909                          *      Allocate a new page for this object/offset
 910                          *      pair.
 911                          */
 912
 913                         m = vm_page_grab_fictitious();
 914 #if TRACEFAULTPAGE
 915                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
 916 #endif
 917                         if (m == VM_PAGE_NULL) {
 918                                 vm_fault_cleanup(object, first_m);
 919                                 cur_thread->interruptible = interruptible_state;
 920                                 return(VM_FAULT_FICTITIOUS_SHORTAGE);
 921                         }
 922                         vm_page_insert(m, object, offset);
 923                 }
 924
 925                 if ((look_for_page && !must_be_resident)) {
 926                         kern_return_t   rc;
 927
 928                         /*
 929                          *      If the memory manager is not ready, we
 930                          *      cannot make requests.
 931                          */
 932                         if (!object->pager_ready) {
 933 #if TRACEFAULTPAGE
 934                                 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
 935 #endif
 936                                 if(m != VM_PAGE_NULL)
 937                                         VM_PAGE_FREE(m);
 938                                 XPR(XPR_VM_FAULT,
 939                                 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
 940                                         (integer_t)object, offset, 0, 0, 0);
 941                                 /* take an extra ref so object won't die */
 942                                 assert(object->ref_count > 0);
 943                                 object->ref_count++;
 944                                 vm_object_res_reference(object);
 945                                 vm_fault_cleanup(object, first_m);
 946                                 counter(c_vm_fault_page_block_backoff_kernel++);
 947                                 vm_object_lock(object);
 948                                 assert(object->ref_count > 0);
 949                                 if (!object->pager_ready) {
 950                                         vm_object_assert_wait(object,
 951                                                               VM_OBJECT_EVENT_PAGER_READY,
 952                                                               interruptible);
 953                                         vm_object_unlock(object);
 954                                         wait_result = thread_block((void (*)(void))0);
 955                                         vm_object_deallocate(object);
 956                                         goto backoff;
 957                                 } else {
 958                                         vm_object_unlock(object);
 959                                         vm_object_deallocate(object);
 960                                         cur_thread->interruptible = interruptible_state;
 961                                         return VM_FAULT_RETRY;
 962                                 }
 963                         }
 964
 965                         if(object->phys_contiguous) {
 966                                 if(m != VM_PAGE_NULL) {
 967                                         VM_PAGE_FREE(m);
 968                                         m = VM_PAGE_NULL;
 969                                 }
 970                                 goto no_clustering;
 971                         }
 972                         if (object->internal) {
 973                                 /*
 974                                  *      Requests to the default pager
 975                                  *      must reserve a real page in advance,
 976                                  *      because the pager's data-provided
 977                                  *      won't block for pages.  IMPORTANT:
 978                                  *      this acts as a throttling mechanism
 979                                  *      for data_requests to the default
 980                                  *      pager.
 981                                  */
 982
 983 #if TRACEFAULTPAGE
 984                                 dbgTrace(0xBEEF000F, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
 985 #endif
 986                                 if (m->fictitious && !vm_page_convert(m)) {
 987                                         VM_PAGE_FREE(m);
 988                                         vm_fault_cleanup(object, first_m);
 989                                         cur_thread->interruptible = interruptible_state;
 990                                         return(VM_FAULT_MEMORY_SHORTAGE);
 991                                 }
 992                         } else if (object->absent_count >
 993                                                 vm_object_absent_max) {
 994                                 /*
 995                                  *      If there are too many outstanding page
 996                                  *      requests pending on this object, we
 997                                  *      wait for them to be resolved now.
 998                                  */
 999
1000 #if TRACEFAULTPAGE
1001                                 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1002 #endif
1003                                 if(m != VM_PAGE_NULL)
1004                                         VM_PAGE_FREE(m);
1005                                 /* take an extra ref so object won't die */
1006                                 assert(object->ref_count > 0);
1007                                 object->ref_count++;
1008                                 vm_object_res_reference(object);
1009                                 vm_fault_cleanup(object, first_m);
1010                                 counter(c_vm_fault_page_block_backoff_kernel++);
1011                                 vm_object_lock(object);
1012                                 assert(object->ref_count > 0);
1013                                 if (object->absent_count > vm_object_absent_max) {
1014                                         vm_object_absent_assert_wait(object,
1015                                                                      interruptible);
1016                                         vm_object_unlock(object);
1017                                         wait_result = thread_block((void (*)(void))0);
1018                                         vm_object_deallocate(object);
1019                                         goto backoff;
1020                                 } else {
1021                                         vm_object_unlock(object);
1022                                         vm_object_deallocate(object);
1023                                         cur_thread->interruptible = interruptible_state;
1024                                         return VM_FAULT_RETRY;
1025                                 }
1026                         }
1027
1028                         /*
1029                          *      Indicate that the page is waiting for data
1030                          *      from the memory manager.
1031                          */
1032
1033                         if(m != VM_PAGE_NULL) {
1034
1035                                 m->list_req_pending = TRUE;
1036                                 m->absent = TRUE;
1037                                 m->unusual = TRUE;
1038                                 object->absent_count++;
1039
1040                         }
1041
1042                         cluster_start = offset;
1043                         length = PAGE_SIZE;
1044                         cluster_size = object->cluster_size;
1045
1046                         /*
1047                          * Skip clustered pagein if it is globally disabled
1048                          * or random page reference behavior is expected
1049                          * for the address range containing the faulting
1050                          * address or the object paging block size is
1051                          * equal to the page size.
1052                          */
1053                         if (!vm_allow_clustered_pagein ||
1054                              behavior == VM_BEHAVIOR_RANDOM ||
1055                              m == VM_PAGE_NULL ||
1056                              cluster_size == PAGE_SIZE) {
1057                                 cluster_start = trunc_page_64(cluster_start);
1058                                 goto no_clustering;
1059                         }
1060
1061                         assert(offset >= lo_offset);
1062                         assert(offset < hi_offset);
1063                         assert(ALIGNED(object->paging_offset));
1064                         assert(cluster_size >= PAGE_SIZE);
1065
1066 #if TRACEFAULTPAGE
1067                         dbgTrace(0xBEEF0011, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1068 #endif
1069                         /*
1070                          * Decide whether to scan ahead or behind for
1071                          * additional pages contiguous to the faulted
1072                          * page in the same paging block.  The decision
1073                          * is based on system wide globals and the
1074                          * expected page reference behavior of the
1075                          * address range containing the faulting address.
1076                          * First calculate some constants.
1077                          */
1078                         paging_offset = offset + object->paging_offset;
1079                         cluster_offset = paging_offset & (cluster_size - 1);
1080                         align_offset = paging_offset&(PAGE_SIZE_64-1);
1081                         if (align_offset != 0) {
1082                                 cluster_offset = trunc_page_64(cluster_offset);
1083                         }
1084
1085 #define SPANS_CLUSTER(x) ((((x) - align_offset) & (vm_object_offset_t)(cluster_size - 1)) == 0)
1086
1087                         /*
1088                          * Backward scan only if reverse sequential
1089                          * behavior has been specified
1090                          */
1091                         CLUSTER_STAT(pages_at_lower_offsets = 0;)
1092                         if (((vm_default_behind != 0 &&
1093                              behavior == VM_BEHAVIOR_DEFAULT) ||
1094                              behavior == VM_BEHAVIOR_RSEQNTL) && offset) {
1095                             vm_object_offset_t cluster_bot;
1096
1097                             /*
1098                              * Calculate lower search boundary.
1099                              * Exclude pages that span a cluster boundary.
1100                              * Clip to start of map entry.
1101                              * For default page reference behavior, scan
1102                              * default pages behind.
1103                              */
1104                             cluster_bot = (offset > cluster_offset) ?
1105                                             offset - cluster_offset : offset;
1106                             if (align_offset != 0) {
1107                                 if ((cluster_bot < offset) &&
1108                                     SPANS_CLUSTER(cluster_bot)) {
1109                                         cluster_bot += PAGE_SIZE_64;
1110                                 }
1111                             }
1112                             if (behavior == VM_BEHAVIOR_DEFAULT) {
1113                                 vm_object_offset_t
1114                                         bot = (vm_object_offset_t)
1115                                                 (vm_default_behind * PAGE_SIZE);
1116
1117                                 if (cluster_bot < (offset - bot))
1118                                         cluster_bot = offset - bot;
1119                             }
1120                             if (lo_offset > cluster_bot)
1121                                 cluster_bot = lo_offset;
1122
1123                             for ( cluster_start = offset - PAGE_SIZE_64;
1124                                  (cluster_start >= cluster_bot) &&
1125                                  (cluster_start !=
1126                                         (align_offset - PAGE_SIZE_64));
1127                                   cluster_start -= PAGE_SIZE_64) {
1128                                 assert(cluster_size > PAGE_SIZE_64);
1129 retry_cluster_backw:
1130                                 if (!LOOK_FOR(object, cluster_start) ||
1131                                     vm_page_lookup(object, cluster_start)
1132                                                 != VM_PAGE_NULL) {
1133                                         break;
1134                                 }
1135                                 if (object->internal) {
1136                                         /*
1137                                          * need to acquire a real page in
1138                                          * advance because this acts as
1139                                          * a throttling mechanism for
1140                                          * data_requests to the default
1141                                          * pager.  If this fails, give up
1142                                          * trying to find any more pages
1143                                          * in the cluster and send off the
1144                                          * request for what we already have.
1145                                          */
1146                                         if ((m = vm_page_grab())
1147                                                         == VM_PAGE_NULL) {
1148                                             cluster_start += PAGE_SIZE_64;
1149                                             cluster_end = offset + PAGE_SIZE_64;
1150                                             goto give_up;
1151                                         }
1152                                 } else if ((m = vm_page_grab_fictitious())
1153                                                 == VM_PAGE_NULL) {
1154                                         vm_object_unlock(object);
1155                                         vm_page_more_fictitious();
1156                                         vm_object_lock(object);
1157                                         goto retry_cluster_backw;
1158                                 }
1159                                 m->absent = TRUE;
1160                                 m->unusual = TRUE;
1161                                 m->clustered = TRUE;
1162                                 m->list_req_pending = TRUE;
1163
1164                                 vm_page_insert(m, object, cluster_start);
1165                                 CLUSTER_STAT(pages_at_lower_offsets++;)
1166                                 object->absent_count++;
1167                             }
1168                             cluster_start += PAGE_SIZE_64;
1169                             assert(cluster_start >= cluster_bot);
1170                         }
1171                         assert(cluster_start <= offset);
1172
1173                         /*
1174                          * Forward scan if default or sequential behavior
1175                          * specified
1176                          */
1177                         CLUSTER_STAT(pages_at_higher_offsets = 0;)
1178                         if ((behavior == VM_BEHAVIOR_DEFAULT &&
1179                              vm_default_ahead != 0) ||
1180                              behavior == VM_BEHAVIOR_SEQUENTIAL) {
1181                             vm_object_offset_t cluster_top;
1182
1183                             /*
1184                              * Calculate upper search boundary.
1185                              * Exclude pages that span a cluster boundary.
1186                              * Clip to end of map entry.
1187                              * For default page reference behavior, scan
1188                              * default pages ahead.
1189                              */
1190                             cluster_top = (offset + cluster_size) -
1191                                           cluster_offset;
1192                             if (align_offset != 0) {
1193                                 if ((cluster_top > (offset + PAGE_SIZE_64)) &&
1194                                     SPANS_CLUSTER(cluster_top)) {
1195                                         cluster_top -= PAGE_SIZE_64;
1196                                 }
1197                             }
1198                             if (behavior == VM_BEHAVIOR_DEFAULT) {
1199                                 vm_object_offset_t top = (vm_object_offset_t)
1200                                      ((vm_default_ahead*PAGE_SIZE)+PAGE_SIZE);
1201
1202                                 if (cluster_top > (offset + top))
1203                                         cluster_top =  offset + top;
1204                             }
1205                             if (cluster_top > hi_offset)
1206                                         cluster_top = hi_offset;
1207
1208                             for (cluster_end = offset + PAGE_SIZE_64;
1209                                  cluster_end < cluster_top;
1210                                  cluster_end += PAGE_SIZE_64) {
1211                                 assert(cluster_size > PAGE_SIZE);
1212 retry_cluster_forw:
1213                                 if (!LOOK_FOR(object, cluster_end) ||
1214                                     vm_page_lookup(object, cluster_end)
1215                                                 != VM_PAGE_NULL) {
1216                                         break;
1217                                 }
1218                                 if (object->internal) {
1219                                         /*
1220                                          * need to acquire a real page in
1221                                          * advance because this acts as
1222                                          * a throttling mechanism for
1223                                          * data_requests to the default
1224                                          * pager.  If this fails, give up
1225                                          * trying to find any more pages
1226                                          * in the cluster and send off the
1227                                          * request for what we already have.
1228                                          */
1229                                         if ((m = vm_page_grab())
1230                                                         == VM_PAGE_NULL) {
1231                                             break;
1232                                         }
1233                                 } else if ((m = vm_page_grab_fictitious())
1234                                                 == VM_PAGE_NULL) {
1235                                     vm_object_unlock(object);
1236                                     vm_page_more_fictitious();
1237                                     vm_object_lock(object);
1238                                     goto retry_cluster_forw;
1239                                 }
1240                                 m->absent = TRUE;
1241                                 m->unusual = TRUE;
1242                                 m->clustered = TRUE;
1243                                 m->list_req_pending = TRUE;
1244
1245                                 vm_page_insert(m, object, cluster_end);
1246                                 CLUSTER_STAT(pages_at_higher_offsets++;)
1247                                 object->absent_count++;
1248                             }
1249                             assert(cluster_end <= cluster_top);
1250                         }
1251                         else {
1252                                 cluster_end = offset + PAGE_SIZE_64;
1253                         }
1254 give_up:
1255                         assert(cluster_end >= offset + PAGE_SIZE_64);
1256                         length = cluster_end - cluster_start;
1257
1258 #if     MACH_CLUSTER_STATS
1259                         CLUSTER_STAT_HIGHER(pages_at_higher_offsets);
1260                         CLUSTER_STAT_LOWER(pages_at_lower_offsets);
1261                         CLUSTER_STAT_CLUSTER(length/PAGE_SIZE);
1262 #endif  /* MACH_CLUSTER_STATS */
1263
1264 no_clustering:
1265                         /*
1266                          * lengthen the cluster by the pages in the working set
1267                          */
1268                         if((map != NULL) &&
1269                                 (current_task()->dynamic_working_set != 0)) {
1270                                 cluster_end = cluster_start + length;
1271                                 /* tws values for start and end are just
1272                                  * suggestions.  Therefore, as long as
1273                                  * build_cluster does not use pointers or
1274                                  * take action based on values that
1275                                  * could be affected by re-entrance we
1276                                  * do not need to take the map lock.
1277                                  */
1278                                 tws_build_cluster((tws_hash_t)
1279                                         current_task()->dynamic_working_set,
1280                                         object, &cluster_start,
1281                                         &cluster_end, 0x16000);
1282                                 length = cluster_end - cluster_start;
1283                         }
1284 #if TRACEFAULTPAGE
1285                         dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);  /* (TEST/DEBUG) */
1286 #endif
1287                         /*
1288                          *      We have a busy page, so we can
1289                          *      release the object lock.
1290                          */
1291                         vm_object_unlock(object);
1292
1293                         /*
1294                          *      Call the memory manager to retrieve the data.
1295                          */
1296
1297                         if (type_of_fault)
1298                                 *type_of_fault = DBG_PAGEIN_FAULT;
1299                         VM_STAT(pageins++);
1300                         current_task()->pageins++;
1301                         bumped_pagein = TRUE;
1302
1303                         /*
1304                          *      If this object uses a copy_call strategy,
1305                          *      and we are interested in a copy of this object
1306                          *      (having gotten here only by following a
1307                          *      shadow chain), then tell the memory manager
1308                          *      via a flag added to the desired_access
1309                          *      parameter, so that it can detect a race
1310                          *      between our walking down the shadow chain
1311                          *      and its pushing pages up into a copy of
1312                          *      the object that it manages.
1313                          */
1314
1315                         if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL &&
1316                             object != first_object) {
1317                                 wants_copy_flag = VM_PROT_WANTS_COPY;
1318                         } else {
1319                                 wants_copy_flag = VM_PROT_NONE;
1320                         }
1321
1322                         XPR(XPR_VM_FAULT,
1323                             "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1324                                 (integer_t)object, offset, (integer_t)m,
1325                                 access_required | wants_copy_flag, 0);
1326
1327                         rc = memory_object_data_request(object->pager,
1328                                         cluster_start + object->paging_offset,
1329                                         length,
1330                                         access_required | wants_copy_flag);
1331
1332
1333 #if TRACEFAULTPAGE
1334                         dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1335 #endif
1336                         if (rc != KERN_SUCCESS) {
1337                                 if (rc != MACH_SEND_INTERRUPTED
1338                                     && vm_fault_debug)
1339                                         printf("%s(0x%x, 0x%x, 0x%x, 0x%x) failed, rc=%d\n",
1340                                                 "memory_object_data_request",
1341                                                 object->pager,
1342                                                 cluster_start + object->paging_offset,
1343                                                 length, access_required, rc);
1344                                 /*
1345                                  *      Don't want to leave a busy page around,
1346                                  *      but the data request may have blocked,
1347                                  *      so check if it's still there and busy.
1348                                  */
1349                                 if(!object->phys_contiguous) {
1350                                    vm_object_lock(object);
1351                                    for (; length; length -= PAGE_SIZE,
1352                                       cluster_start += PAGE_SIZE_64) {
1353                                       vm_page_t p;
1354                                       if ((p = vm_page_lookup(object,
1355                                                                 cluster_start))
1356                                             && p->absent && p->busy
1357                                             && p != first_m) {
1358                                          VM_PAGE_FREE(p);
1359                                       }
1360                                    }
1361                                 }
1362                                 vm_fault_cleanup(object, first_m);
1363                                 cur_thread->interruptible = interruptible_state;
1364                                 return((rc == MACH_SEND_INTERRUPTED) ?
1365                                         VM_FAULT_INTERRUPTED :
1366                                         VM_FAULT_MEMORY_ERROR);
1367                         } else {
1368 #ifdef notdefcdy
1369                                 tws_hash_line_t line;
1370                                 task_t          task;
1371
1372                                 task = current_task();
1373
1374                                 if((map != NULL) &&
1375                                         (task->dynamic_working_set != 0)) {
1376                                         if(tws_lookup
1377                                                 ((tws_hash_t)
1378                                                 task->dynamic_working_set,
1379                                                 offset, object,
1380                                                 &line) == KERN_SUCCESS) {
1381                                                 tws_line_signal((tws_hash_t)
1382                                                 task->dynamic_working_set,
1383                                                         map, line, vaddr);
1384                                         }
1385                                 }
1386 #endif
1387                         }
1388
1389                         /*
1390                          * Retry with same object/offset, since new data may
1391                          * be in a different page (i.e., m is meaningless at
1392                          * this point).
1393                          */
1394                         vm_object_lock(object);
1395                         if ((interruptible != THREAD_UNINT) &&
1396                             (current_thread()->state & TH_ABORT)) {
1397                                 vm_fault_cleanup(object, first_m);
1398                                 cur_thread->interruptible = interruptible_state;
1399                                 return(VM_FAULT_INTERRUPTED);
1400                         }
1401                         if(m == VM_PAGE_NULL)
1402                                 break;
1403                         continue;
1404                 }
1405
1406                 /*
1407                  * The only case in which we get here is if
1408                  * object has no pager (or unwiring).  If the pager doesn't
1409                  * have the page this is handled in the m->absent case above
1410                  * (and if you change things here you should look above).
1411                  */
1412 #if TRACEFAULTPAGE
1413                 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1414 #endif
1415                 if (object == first_object)
1416                         first_m = m;
1417                 else
1418                         assert(m == VM_PAGE_NULL);
1419
1420                 XPR(XPR_VM_FAULT,
1421                     "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1422                         (integer_t)object, offset, (integer_t)m,
1423                         (integer_t)object->shadow, 0);
1424                 /*
1425                  *      Move on to the next object.  Lock the next
1426                  *      object before unlocking the current one.
1427                  */
1428                 next_object = object->shadow;
1429                 if (next_object == VM_OBJECT_NULL) {
1430                         assert(!must_be_resident);
1431                         /*
1432                          *      If there's no object left, fill the page
1433                          *      in the top object with zeros.  But first we
1434                          *      need to allocate a real page.
1435                          */
1436
1437                         if (object != first_object) {
1438                                 vm_object_paging_end(object);
1439                                 vm_object_unlock(object);
1440
1441                                 object = first_object;
1442                                 offset = first_offset;
1443                                 vm_object_lock(object);
1444                         }
1445
1446                         m = first_m;
1447                         assert(m->object == object);
1448                         first_m = VM_PAGE_NULL;
1449
1450                         if (object->shadow_severed) {
1451                                 VM_PAGE_FREE(m);
1452                                 vm_fault_cleanup(object, VM_PAGE_NULL);
1453                                 cur_thread->interruptible = interruptible_state;
1454                                 return VM_FAULT_MEMORY_ERROR;
1455                         }
1456
1457                         if (VM_PAGE_THROTTLED() ||
1458                             (m->fictitious && !vm_page_convert(m))) {
1459                                 VM_PAGE_FREE(m);
1460                                 vm_fault_cleanup(object, VM_PAGE_NULL);
1461                                 cur_thread->interruptible = interruptible_state;
1462                                 return(VM_FAULT_MEMORY_SHORTAGE);
1463                         }
1464                         m->no_isync = FALSE;
1465
1466                         if (!no_zero_fill) {
1467                                 vm_object_unlock(object);
1468                                 vm_page_zero_fill(m);
1469                                 if (type_of_fault)
1470                                         *type_of_fault = DBG_ZERO_FILL_FAULT;
1471                                 VM_STAT(zero_fill_count++);
1472
1473                                 if (bumped_pagein == TRUE) {
1474                                         VM_STAT(pageins--);
1475                                         current_task()->pageins--;
1476                                 }
1477                                 vm_object_lock(object);
1478                         }
1479                         vm_page_lock_queues();
1480                         VM_PAGE_QUEUES_REMOVE(m);
1481                         m->page_ticket = vm_page_ticket;
1482                         vm_page_ticket_roll++;
1483                         if(vm_page_ticket_roll == VM_PAGE_TICKETS_IN_ROLL) {
1484                                 vm_page_ticket_roll = 0;
1485                                 if(vm_page_ticket ==
1486                                         VM_PAGE_TICKET_ROLL_IDS)
1487                                         vm_page_ticket= 0;
1488                                 else
1489                                         vm_page_ticket++;
1490                         }
1491                         queue_enter(&vm_page_queue_inactive,
1492                                                 m, vm_page_t, pageq);
1493                         m->inactive = TRUE;
1494                         vm_page_inactive_count++;
1495                         vm_page_unlock_queues();
1496                         pmap_clear_modify(m->phys_addr);
1497                         break;
1498                 }
1499                 else {
1500                         if ((object != first_object) || must_be_resident)
1501                                 vm_object_paging_end(object);
1502                         offset += object->shadow_offset;
1503                         hi_offset += object->shadow_offset;
1504                         lo_offset += object->shadow_offset;
1505                         access_required = VM_PROT_READ;
1506                         vm_object_lock(next_object);
1507                         vm_object_unlock(object);
1508                         object = next_object;
1509                         vm_object_paging_begin(object);
1510                 }
1511         }
1512
1513         /*
1514          *      PAGE HAS BEEN FOUND.
1515          *
1516          *      This page (m) is:
1517          *              busy, so that we can play with it;
1518          *              not absent, so that nobody else will fill it;
1519          *              possibly eligible for pageout;
1520          *
1521          *      The top-level page (first_m) is:
1522          *              VM_PAGE_NULL if the page was found in the
1523          *               top-level object;
1524          *              busy, not absent, and ineligible for pageout.
1525          *
1526          *      The current object (object) is locked.  A paging
1527          *      reference is held for the current and top-level
1528          *      objects.
1529          */
1530
1531 #if TRACEFAULTPAGE
1532         dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1533 #endif
1534 #if     EXTRA_ASSERTIONS
1535         if(m != VM_PAGE_NULL) {
1536                 assert(m->busy && !m->absent);
1537                 assert((first_m == VM_PAGE_NULL) ||
1538                         (first_m->busy && !first_m->absent &&
1539                          !first_m->active && !first_m->inactive));
1540         }
1541 #endif  /* EXTRA_ASSERTIONS */
1542
1543         XPR(XPR_VM_FAULT,
1544        "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1545                 (integer_t)object, offset, (integer_t)m,
1546                 (integer_t)first_object, (integer_t)first_m);
1547         /*
1548          *      If the page is being written, but isn't
1549          *      already owned by the top-level object,
1550          *      we have to copy it into a new page owned
1551          *      by the top-level object.
1552          */
1553
1554         if ((object != first_object) && (m != VM_PAGE_NULL)) {
1555                 /*
1556                  *      We only really need to copy if we
1557                  *      want to write it.
1558                  */
1559
1560 #if TRACEFAULTPAGE
1561                         dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1562 #endif
1563                 if (fault_type & VM_PROT_WRITE) {
1564                         vm_page_t copy_m;
1565
1566                         assert(!must_be_resident);
1567
1568                         /*
1569                          *      If we try to collapse first_object at this
1570                          *      point, we may deadlock when we try to get
1571                          *      the lock on an intermediate object (since we
1572                          *      have the bottom object locked).  We can't
1573                          *      unlock the bottom object, because the page
1574                          *      we found may move (by collapse) if we do.
1575                          *
1576                          *      Instead, we first copy the page.  Then, when
1577                          *      we have no more use for the bottom object,
1578                          *      we unlock it and try to collapse.
1579                          *
1580                          *      Note that we copy the page even if we didn't
1581                          *      need to... that's the breaks.
1582                          */
1583
1584                         /*
1585                          *      Allocate a page for the copy
1586                          */
1587                         copy_m = vm_page_grab();
1588                         if (copy_m == VM_PAGE_NULL) {
1589                                 RELEASE_PAGE(m);
1590                                 vm_fault_cleanup(object, first_m);
1591                                 cur_thread->interruptible = interruptible_state;
1592                                 return(VM_FAULT_MEMORY_SHORTAGE);
1593                         }
1594
1595
1596                         XPR(XPR_VM_FAULT,
1597                             "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1598                                 (integer_t)object, offset,
1599                                 (integer_t)m, (integer_t)copy_m, 0);
1600                         vm_page_copy(m, copy_m);
1601
1602                         /*
1603                          *      If another map is truly sharing this
1604                          *      page with us, we have to flush all
1605                          *      uses of the original page, since we
1606                          *      can't distinguish those which want the
1607                          *      original from those which need the
1608                          *      new copy.
1609                          *
1610                          *      XXXO If we know that only one map has
1611                          *      access to this page, then we could
1612                          *      avoid the pmap_page_protect() call.
1613                          */
1614
1615                         vm_page_lock_queues();
1616                         assert(!m->cleaning);
1617                         pmap_page_protect(m->phys_addr, VM_PROT_NONE);
1618                         vm_page_deactivate(m);
1619                         copy_m->dirty = TRUE;
1620                         /*
1621                          * Setting reference here prevents this fault from
1622                          * being counted as a (per-thread) reactivate as well
1623                          * as a copy-on-write.
1624                          */
1625                         first_m->reference = TRUE;
1626                         vm_page_unlock_queues();
1627
1628                         /*
1629                          *      We no longer need the old page or object.
1630                          */
1631
1632                         PAGE_WAKEUP_DONE(m);
1633                         vm_object_paging_end(object);
1634                         vm_object_unlock(object);
1635
1636                         if (type_of_fault)
1637                                 *type_of_fault = DBG_COW_FAULT;
1638                         VM_STAT(cow_faults++);
1639                         current_task()->cow_faults++;
1640                         object = first_object;
1641                         offset = first_offset;
1642
1643                         vm_object_lock(object);
1644                         VM_PAGE_FREE(first_m);
1645                         first_m = VM_PAGE_NULL;
1646                         assert(copy_m->busy);
1647                         vm_page_insert(copy_m, object, offset);
1648                         m = copy_m;
1649
1650                         /*
1651                          *      Now that we've gotten the copy out of the
1652                          *      way, let's try to collapse the top object.
1653                          *      But we have to play ugly games with
1654                          *      paging_in_progress to do that...
1655                          */
1656
1657                         vm_object_paging_end(object);
1658                         vm_object_collapse(object);
1659                         vm_object_paging_begin(object);
1660
1661                 }
1662                 else {
1663                         *protection &= (~VM_PROT_WRITE);
1664                 }
1665         }
1666
1667         /*
1668          *      Now check whether the page needs to be pushed into the
1669          *      copy object.  The use of asymmetric copy on write for
1670          *      shared temporary objects means that we may do two copies to
1671          *      satisfy the fault; one above to get the page from a
1672          *      shadowed object, and one here to push it into the copy.
1673          */
1674
1675         while (first_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
1676                (copy_object = first_object->copy) != VM_OBJECT_NULL &&
1677                    (m!= VM_PAGE_NULL)) {
1678                 vm_object_offset_t      copy_offset;
1679                 vm_page_t               copy_m;
1680
1681 #if TRACEFAULTPAGE
1682                 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);    /* (TEST/DEBUG) */
1683 #endif
1684                 /*
1685                  *      If the page is being written, but hasn't been
1686                  *      copied to the copy-object, we have to copy it there.
1687                  */
1688
1689                 if ((fault_type & VM_PROT_WRITE) == 0) {
1690                         *protection &= ~VM_PROT_WRITE;
1691                         break;
1692                 }
1693
1694                 /*
1695                  *      If the page was guaranteed to be resident,
1696                  *      we must have already performed the copy.
1697                  */
1698
1699                 if (must_be_resident)
1700                         break;
1701
1702                 /*
1703                  *      Try to get the lock on the copy_object.
1704                  */
1705                 if (!vm_object_lock_try(copy_object)) {
1706                         vm_object_unlock(object);
1707
1708                         mutex_pause();  /* wait a bit */
1709
1710                         vm_object_lock(object);
1711                         continue;
1712                 }
1713
1714                 /*
1715                  *      Make another reference to the copy-object,
1716                  *      to keep it from disappearing during the
1717                  *      copy.
1718                  */
1719                 assert(copy_object->ref_count > 0);
1720                 copy_object->ref_count++;
1721                 VM_OBJ_RES_INCR(copy_object);
1722
1723                 /*
1724                  *      Does the page exist in the copy?
1725                  */
1726                 copy_offset = first_offset - copy_object->shadow_offset;
1727                 if (copy_object->size <= copy_offset)
1728                         /*
1729                          * Copy object doesn't cover this page -- do nothing.
1730                          */
1731                         ;
1732                 else if ((copy_m =
1733                         vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1734                         /* Page currently exists in the copy object */
1735                         if (copy_m->busy) {
1736                                 /*
1737                                  *      If the page is being brought
1738                                  *      in, wait for it and then retry.
1739                                  */
1740                                 RELEASE_PAGE(m);
1741                                 /* take an extra ref so object won't die */
1742                                 assert(copy_object->ref_count > 0);
1743                                 copy_object->ref_count++;
1744                                 vm_object_res_reference(copy_object);
1745                                 vm_object_unlock(copy_object);
1746                                 vm_fault_cleanup(object, first_m);
1747                                 counter(c_vm_fault_page_block_backoff_kernel++);
1748                                 vm_object_lock(copy_object);
1749                                 assert(copy_object->ref_count > 0);
1750                                 VM_OBJ_RES_DECR(copy_object);
1751                                 copy_object->ref_count--;
1752                                 assert(copy_object->ref_count > 0);
1753                                 copy_m = vm_page_lookup(copy_object, copy_offset);
1754                                 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1755                                         PAGE_ASSERT_WAIT(copy_m, interruptible);
1756                                         vm_object_unlock(copy_object);
1757                                         wait_result = thread_block((void (*)(void))0);
1758                                         vm_object_deallocate(copy_object);
1759                                         goto backoff;
1760                                 } else {
1761                                         vm_object_unlock(copy_object);
1762                                         vm_object_deallocate(copy_object);
1763                                         cur_thread->interruptible = interruptible_state;
1764                                         return VM_FAULT_RETRY;
1765                                 }
1766                         }
1767                 }
1768                 else if (!PAGED_OUT(copy_object, copy_offset)) {
1769                         /*
1770                          * If PAGED_OUT is TRUE, then the page used to exist
1771                          * in the copy-object, and has already been paged out.
1772                          * We don't need to repeat this. If PAGED_OUT is
1773                          * FALSE, then either we don't know (!pager_created,
1774                          * for example) or it hasn't been paged out.
1775                          * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1776                          * We must copy the page to the copy object.
1777                          */
1778
1779                         /*
1780                          *      Allocate a page for the copy
1781                          */
1782                         copy_m = vm_page_alloc(copy_object, copy_offset);
1783                         if (copy_m == VM_PAGE_NULL) {
1784                                 RELEASE_PAGE(m);
1785                                 VM_OBJ_RES_DECR(copy_object);
1786                                 copy_object->ref_count--;
1787                                 assert(copy_object->ref_count > 0);
1788                                 vm_object_unlock(copy_object);
1789                                 vm_fault_cleanup(object, first_m);
1790                                 cur_thread->interruptible = interruptible_state;
1791                                 return(VM_FAULT_MEMORY_SHORTAGE);
1792                         }
1793
1794                         /*
1795                          *      Must copy page into copy-object.
1796                          */
1797
1798                         vm_page_copy(m, copy_m);
1799
1800                         /*
1801                          *      If the old page was in use by any users
1802                          *      of the copy-object, it must be removed
1803                          *      from all pmaps.  (We can't know which
1804                          *      pmaps use it.)
1805                          */
1806
1807                         vm_page_lock_queues();
1808                         assert(!m->cleaning);
1809                         pmap_page_protect(m->phys_addr, VM_PROT_NONE);
1810                         copy_m->dirty = TRUE;
1811                         vm_page_unlock_queues();
1812
1813                         /*
1814                          *      If there's a pager, then immediately
1815                          *      page out this page, using the "initialize"
1816                          *      option.  Else, we use the copy.
1817                          */
1818
1819                         if
1820 #if     MACH_PAGEMAP
1821                           ((!copy_object->pager_created) ||
1822                                 vm_external_state_get(
1823                                         copy_object->existence_map, copy_offset)
1824                                 == VM_EXTERNAL_STATE_ABSENT)
1825 #else
1826                           (!copy_object->pager_created)
1827 #endif
1828                                 {
1829                                 vm_page_lock_queues();
1830                                 vm_page_activate(copy_m);
1831                                 vm_page_unlock_queues();
1832                                 PAGE_WAKEUP_DONE(copy_m);
1833                         }
1834                         else {
1835                                 assert(copy_m->busy == TRUE);
1836
1837                                 /*
1838                                  *      The page is already ready for pageout:
1839                                  *      not on pageout queues and busy.
1840                                  *      Unlock everything except the
1841                                  *      copy_object itself.
1842                                  */
1843
1844                                 vm_object_unlock(object);
1845
1846                                 /*
1847                                  *      Write the page to the copy-object,
1848                                  *      flushing it from the kernel.
1849                                  */
1850
1851                                 vm_pageout_initialize_page(copy_m);
1852
1853                                 /*
1854                                  *      Since the pageout may have
1855                                  *      temporarily dropped the
1856                                  *      copy_object's lock, we
1857                                  *      check whether we'll have
1858                                  *      to deallocate the hard way.
1859                                  */
1860
1861                                 if ((copy_object->shadow != object) ||
1862                                     (copy_object->ref_count == 1)) {
1863                                         vm_object_unlock(copy_object);
1864                                         vm_object_deallocate(copy_object);
1865                                         vm_object_lock(object);
1866                                         continue;
1867                                 }
1868
1869                                 /*
1870                                  *      Pick back up the old object's
1871                                  *      lock.  [It is safe to do so,
1872                                  *      since it must be deeper in the
1873                                  *      object tree.]
1874                                  */
1875
1876                                 vm_object_lock(object);
1877                         }
1878
1879                         /*
1880                          *      Because we're pushing a page upward
1881                          *      in the object tree, we must restart
1882                          *      any faults that are waiting here.
1883                          *      [Note that this is an expansion of
1884                          *      PAGE_WAKEUP that uses the THREAD_RESTART
1885                          *      wait result].  Can't turn off the page's
1886                          *      busy bit because we're not done with it.
1887                          */
1888
1889                         if (m->wanted) {
1890                                 m->wanted = FALSE;
1891                                 thread_wakeup_with_result((event_t) m,
1892                                         THREAD_RESTART);
1893                         }
1894                 }
1895
1896                 /*
1897                  *      The reference count on copy_object must be
1898                  *      at least 2: one for our extra reference,
1899                  *      and at least one from the outside world
1900                  *      (we checked that when we last locked
1901                  *      copy_object).
1902                  */
1903                 copy_object->ref_count--;
1904                 assert(copy_object->ref_count > 0);
1905                 VM_OBJ_RES_DECR(copy_object);
1906                 vm_object_unlock(copy_object);
1907
1908                 break;
1909         }
1910
1911         *result_page = m;
1912         *top_page = first_m;
1913
1914         XPR(XPR_VM_FAULT,
1915                 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1916                 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0);
1917         /*
1918          *      If the page can be written, assume that it will be.
1919          *      [Earlier, we restrict the permission to allow write
1920          *      access only if the fault so required, so we don't
1921          *      mark read-only data as dirty.]
1922          */
1923
1924 #if     !VM_FAULT_STATIC_CONFIG
1925         if (vm_fault_dirty_handling && (*protection & VM_PROT_WRITE) &&
1926                         (m != VM_PAGE_NULL)) {
1927                 m->dirty = TRUE;
1928         }
1929 #endif
1930 #if TRACEFAULTPAGE
1931         dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_page_deactivate_behind);  /* (TEST/DEBUG) */
1932 #endif
1933         if (vm_page_deactivate_behind) {
1934                 if (offset && /* don't underflow */
1935                         (object->last_alloc == (offset - PAGE_SIZE_64))) {
1936                         m = vm_page_lookup(object, object->last_alloc);
1937                         if ((m != VM_PAGE_NULL) && !m->busy) {
1938                                 vm_page_lock_queues();
1939                                 vm_page_deactivate(m);
1940                                 vm_page_unlock_queues();
1941                         }
1942 #if TRACEFAULTPAGE
1943                         dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1944 #endif
1945                 }
1946                 object->last_alloc = offset;
1947         }
1948 #if TRACEFAULTPAGE
1949         dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);       /* (TEST/DEBUG) */
1950 #endif
1951         cur_thread->interruptible = interruptible_state;
1952         if(*result_page == VM_PAGE_NULL) {
1953                 vm_object_unlock(object);
1954         }
1955         return(VM_FAULT_SUCCESS);
1956
1957 #if 0
1958     block_and_backoff:
1959         vm_fault_cleanup(object, first_m);
1960
1961         counter(c_vm_fault_page_block_backoff_kernel++);
1962         thread_block((void (*)(void))0);
1963 #endif
1964
1965     backoff:
1966         cur_thread->interruptible = interruptible_state;
1967         if (wait_result == THREAD_INTERRUPTED)
1968                 return VM_FAULT_INTERRUPTED;
1969         return VM_FAULT_RETRY;
1970
1971 #undef  RELEASE_PAGE
1972 }
1973
1974 /*
1975  *      Routine:        vm_fault
1976  *      Purpose:
1977  *              Handle page faults, including pseudo-faults
1978  *              used to change the wiring status of pages.
1979  *      Returns:
1980  *              kern_return_t status (explicit continuations have been removed).
1981  *      Implementation:
1982  *              vm_fault and vm_fault_page save a lot of state
1983  *              in the moral equivalent of a closure.  The state
1984  *              structure is allocated when first entering vm_fault
1985  *              and deallocated when leaving vm_fault.
1986  */
1987
1988 kern_return_t
1989 vm_fault(
1990         vm_map_t        map,
1991         vm_offset_t     vaddr,
1992         vm_prot_t       fault_type,
1993         boolean_t       change_wiring,
1994         int             interruptible)
1995 {
1996         vm_map_version_t        version;        /* Map version for verificiation */
1997         boolean_t               wired;          /* Should mapping be wired down? */
1998         vm_object_t             object;         /* Top-level object */
1999         vm_object_offset_t      offset;         /* Top-level offset */
2000         vm_prot_t               prot;           /* Protection for mapping */
2001         vm_behavior_t           behavior;       /* Expected paging behavior */
2002         vm_object_offset_t      lo_offset, hi_offset;
2003         vm_object_t             old_copy_object; /* Saved copy object */
2004         vm_page_t               result_page;    /* Result of vm_fault_page */
2005         vm_page_t               top_page;       /* Placeholder page */
2006         kern_return_t           kr;
2007
2008         register
2009         vm_page_t               m;      /* Fast access to result_page */
2010         kern_return_t           error_code;     /* page error reasons */
2011         register
2012         vm_object_t             cur_object;
2013         register
2014         vm_object_offset_t      cur_offset;
2015         vm_page_t               cur_m;
2016         vm_object_t             new_object;
2017         int                     type_of_fault;
2018         vm_map_t                pmap_map = map;
2019         vm_map_t                original_map = map;
2020         pmap_t                  pmap = NULL;
2021         boolean_t               funnel_set = FALSE;
2022         funnel_t                *curflock;
2023         thread_t                cur_thread;
2024         boolean_t               interruptible_state;
2025
2026
2027         KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_START,
2028                               vaddr,
2029                               0,
2030                               0,
2031                               0,
2032                               0);
2033
2034         cur_thread = current_thread();
2035
2036         interruptible_state = cur_thread->interruptible;
2037         if (interruptible == THREAD_UNINT)
2038                 cur_thread->interruptible = FALSE;
2039
2040         /*
2041          * assume we will hit a page in the cache
2042          * otherwise, explicitly override with
2043          * the real fault type once we determine it
2044          */
2045         type_of_fault = DBG_CACHE_HIT_FAULT;
2046
2047         VM_STAT(faults++);
2048         current_task()->faults++;
2049
2050         /*
2051          * drop funnel if it is already held. Then restore while returning
2052          */
2053         if ((cur_thread->funnel_state & TH_FN_OWNED) == TH_FN_OWNED) {
2054                 funnel_set = TRUE;
2055                 curflock = cur_thread->funnel_lock;
2056                 thread_funnel_set( curflock , FALSE);
2057         }
2058
2059     RetryFault: ;
2060
2061         /*
2062          *      Find the backing store object and offset into
2063          *      it to begin the search.
2064          */
2065         map = original_map;
2066         vm_map_lock_read(map);
2067         kr = vm_map_lookup_locked(&map, vaddr, fault_type, &version,
2068                                 &object, &offset,
2069                                 &prot, &wired,
2070                                 &behavior, &lo_offset, &hi_offset, &pmap_map);
2071
2072         pmap = pmap_map->pmap;
2073
2074         if (kr != KERN_SUCCESS) {
2075                 vm_map_unlock_read(map);
2076                 goto done;
2077         }
2078
2079         /*
2080          *      If the page is wired, we must fault for the current protection
2081          *      value, to avoid further faults.
2082          */
2083
2084         if (wired)
2085                 fault_type = prot | VM_PROT_WRITE;
2086
2087 #if     VM_FAULT_CLASSIFY
2088         /*
2089          *      Temporary data gathering code
2090          */
2091         vm_fault_classify(object, offset, fault_type);
2092 #endif
2093         /*
2094          *      Fast fault code.  The basic idea is to do as much as
2095          *      possible while holding the map lock and object locks.
2096          *      Busy pages are not used until the object lock has to
2097          *      be dropped to do something (copy, zero fill, pmap enter).
2098          *      Similarly, paging references aren't acquired until that
2099          *      point, and object references aren't used.
2100          *
2101          *      If we can figure out what to do
2102          *      (zero fill, copy on write, pmap enter) while holding
2103          *      the locks, then it gets done.  Otherwise, we give up,
2104          *      and use the original fault path (which doesn't hold
2105          *      the map lock, and relies on busy pages).
2106          *      The give up cases include:
2107          *              - Have to talk to pager.
2108          *              - Page is busy, absent or in error.
2109          *              - Pager has locked out desired access.
2110          *              - Fault needs to be restarted.
2111          *              - Have to push page into copy object.
2112          *
2113          *      The code is an infinite loop that moves one level down
2114          *      the shadow chain each time.  cur_object and cur_offset
2115          *      refer to the current object being examined. object and offset
2116          *      are the original object from the map.  The loop is at the
2117          *      top level if and only if object and cur_object are the same.
2118          *
2119          *      Invariants:  Map lock is held throughout.  Lock is held on
2120          *              original object and cur_object (if different) when
2121          *              continuing or exiting loop.
2122          *
2123          */
2124
2125
2126         /*
2127          *      If this page is to be inserted in a copy delay object
2128          *      for writing, and if the object has a copy, then the
2129          *      copy delay strategy is implemented in the slow fault page.
2130          */
2131         if (object->copy_strategy != MEMORY_OBJECT_COPY_DELAY ||
2132             object->copy == VM_OBJECT_NULL ||
2133             (fault_type & VM_PROT_WRITE) == 0) {
2134         cur_object = object;
2135         cur_offset = offset;
2136
2137         while (TRUE) {
2138                 m = vm_page_lookup(cur_object, cur_offset);
2139                 if (m != VM_PAGE_NULL) {
2140                         if (m->busy)
2141                                 break;
2142
2143                         if (m->unusual && (m->error || m->restart || m->private
2144                             || m->absent || (fault_type & m->page_lock))) {
2145
2146                         /*
2147                                  *      Unusual case. Give up.
2148                                  */
2149                                 break;
2150                         }
2151
2152                         /*
2153                          *      Two cases of map in faults:
2154                          *          - At top level w/o copy object.
2155                          *          - Read fault anywhere.
2156                          *              --> must disallow write.
2157                          */
2158
2159                         if (object == cur_object &&
2160                             object->copy == VM_OBJECT_NULL)
2161                                 goto FastMapInFault;
2162
2163                         if ((fault_type & VM_PROT_WRITE) == 0) {
2164
2165                                 prot &= ~VM_PROT_WRITE;
2166
2167                                 /*
2168                                  *      Set up to map the page ...
2169                                  *      mark the page busy, drop
2170                                  *      locks and take a paging reference
2171                                  *      on the object with the page.
2172                                  */
2173
2174                                 if (object != cur_object) {
2175                                         vm_object_unlock(object);
2176                                         object = cur_object;
2177                                 }
2178 FastMapInFault:
2179                                 m->busy = TRUE;
2180
2181                                 vm_object_paging_begin(object);
2182                                 vm_object_unlock(object);
2183
2184 FastPmapEnter:
2185                                 /*
2186                                  *      Check a couple of global reasons to
2187                                  *      be conservative about write access.
2188                                  *      Then do the pmap_enter.
2189                                  */
2190 #if     !VM_FAULT_STATIC_CONFIG
2191                                 if (vm_fault_dirty_handling
2192 #if     MACH_KDB
2193                                     || db_watchpoint_list
2194 #endif
2195                                     && (fault_type & VM_PROT_WRITE) == 0)
2196                                         prot &= ~VM_PROT_WRITE;
2197 #else   /* STATIC_CONFIG */
2198 #if     MACH_KDB
2199                                 if (db_watchpoint_list
2200                                     && (fault_type & VM_PROT_WRITE) == 0)
2201                                         prot &= ~VM_PROT_WRITE;
2202 #endif  /* MACH_KDB */
2203 #endif  /* STATIC_CONFIG */
2204                                 if (m->no_isync == TRUE)
2205                                         pmap_sync_caches_phys(m->phys_addr);
2206
2207                                 PMAP_ENTER(pmap, vaddr, m, prot, wired);
2208                                 {
2209                                    tws_hash_line_t      line;
2210                                    task_t               task;
2211
2212                                    task = current_task();
2213                                    if((map != NULL) &&
2214                                         (task->dynamic_working_set != 0)) {
2215                                         if(tws_lookup
2216                                                 ((tws_hash_t)
2217                                                 task->dynamic_working_set,
2218                                                 cur_offset, object,
2219                                                 &line) != KERN_SUCCESS) {
2220                                                 if(tws_insert((tws_hash_t)
2221                                                    task->dynamic_working_set,
2222                                                    m->offset, m->object,
2223                                                    vaddr, pmap_map)
2224                                                         == KERN_NO_SPACE) {
2225                                                    tws_expand_working_set(
2226                                                       task->dynamic_working_set,
2227                                                       TWS_HASH_LINE_COUNT);
2228                                                 }
2229                                         }
2230                                    }
2231                                 }
2232                                 /*
2233                                  *      Grab the object lock to manipulate
2234                                  *      the page queues.  Change wiring
2235                                  *      case is obvious.  In soft ref bits
2236                                  *      case activate page only if it fell
2237                                  *      off paging queues, otherwise just
2238                                  *      activate it if it's inactive.
2239                                  *
2240                                  *      NOTE: original vm_fault code will
2241                                  *      move active page to back of active
2242                                  *      queue.  This code doesn't.
2243                                  */
2244                                 vm_object_lock(object);
2245                                 vm_page_lock_queues();
2246
2247                                 if (m->clustered) {
2248                                         vm_pagein_cluster_used++;
2249                                         m->clustered = FALSE;
2250                                 }
2251                                 /*
2252                                  * we did the isync above (if needed)... we're clearing
2253                                  * the flag here to avoid holding a lock
2254                                  * while calling pmap functions, however
2255                                  * we need hold the object lock before
2256                                  * we can modify the flag
2257                                  */
2258                                 m->no_isync = FALSE;
2259                                 m->reference = TRUE;
2260
2261                                 if (change_wiring) {
2262                                         if (wired)
2263                                                 vm_page_wire(m);
2264                                         else
2265                                                 vm_page_unwire(m);
2266                                 }
2267 #if VM_FAULT_STATIC_CONFIG
2268                                 else {
2269                                         if (!m->active && !m->inactive)
2270                                                 vm_page_activate(m);
2271                                 }
2272 #else
2273                                 else if (software_reference_bits) {
2274                                         if (!m->active && !m->inactive)
2275                                                 vm_page_activate(m);
2276                                 }
2277                                 else if (!m->active) {
2278                                         vm_page_activate(m);
2279                                 }
2280 #endif
2281                                 vm_page_unlock_queues();
2282
2283                                 /*
2284                                  *      That's it, clean up and return.
2285                                  */
2286                                 PAGE_WAKEUP_DONE(m);
2287                                 vm_object_paging_end(object);
2288                                 vm_object_unlock(object);
2289                                 vm_map_unlock_read(map);
2290                                 if(pmap_map != map)
2291                                         vm_map_unlock(pmap_map);
2292
2293                                 if (funnel_set) {
2294                                         thread_funnel_set( curflock, TRUE);
2295                                         funnel_set = FALSE;
2296                                 }
2297                                 cur_thread->interruptible = interruptible_state;
2298
2299                                 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
2300                                                       vaddr,
2301                                                       type_of_fault,
2302                                                       KERN_SUCCESS,
2303                                                       0,
2304                                                       0);
2305                                 return KERN_SUCCESS;
2306                         }
2307
2308                         /*
2309                          *      Copy on write fault.  If objects match, then
2310                          *      object->copy must not be NULL (else control
2311                          *      would be in previous code block), and we
2312                          *      have a potential push into the copy object
2313                          *      with which we won't cope here.
2314                          */
2315
2316                         if (cur_object == object)
2317                                 break;
2318
2319                         /*
2320                          *      This is now a shadow based copy on write
2321                          *      fault -- it requires a copy up the shadow
2322                          *      chain.
2323                          *
2324                          *      Allocate a page in the original top level
2325                          *      object. Give up if allocate fails.  Also
2326                          *      need to remember current page, as it's the
2327                          *      source of the copy.
2328                          */
2329                         cur_m = m;
2330                         m = vm_page_grab();
2331                         if (m == VM_PAGE_NULL) {
2332                                 break;
2333                         }
2334
2335                         /*
2336                          *      Now do the copy.  Mark the source busy
2337                          *      and take out paging references on both
2338                          *      objects.
2339                          *
2340                          *      NOTE: This code holds the map lock across
2341                          *      the page copy.
2342                          */
2343
2344                         cur_m->busy = TRUE;
2345                         vm_page_copy(cur_m, m);
2346                         vm_page_insert(m, object, offset);
2347
2348                         vm_object_paging_begin(cur_object);
2349                         vm_object_paging_begin(object);
2350
2351                         type_of_fault = DBG_COW_FAULT;
2352                         VM_STAT(cow_faults++);
2353                         current_task()->cow_faults++;
2354
2355                         /*
2356                          *      Now cope with the source page and object
2357                          *      If the top object has a ref count of 1
2358                          *      then no other map can access it, and hence
2359                          *      it's not necessary to do the pmap_page_protect.
2360                          */
2361
2362
2363                         vm_page_lock_queues();
2364                         vm_page_deactivate(cur_m);
2365                         m->dirty = TRUE;
2366                         pmap_page_protect(cur_m->phys_addr,
2367                                                   VM_PROT_NONE);
2368                         vm_page_unlock_queues();
2369
2370                         PAGE_WAKEUP_DONE(cur_m);
2371                         vm_object_paging_end(cur_object);
2372                         vm_object_unlock(cur_object);
2373
2374                         /*
2375                          *      Slight hack to call vm_object collapse
2376                          *      and then reuse common map in code.
2377                          *      note that the object lock was taken above.
2378                          */
2379
2380                         vm_object_paging_end(object);
2381                         vm_object_collapse(object);
2382                         vm_object_paging_begin(object);
2383                         vm_object_unlock(object);
2384
2385                         goto FastPmapEnter;
2386                 }
2387                 else {
2388
2389                         /*
2390                          *      No page at cur_object, cur_offset
2391                          */
2392
2393                         if (cur_object->pager_created) {
2394
2395                                 /*
2396                                  *      Have to talk to the pager.  Give up.
2397                                  */
2398
2399                                 break;
2400                         }
2401
2402
2403                         if (cur_object->shadow == VM_OBJECT_NULL) {
2404
2405                                 if (cur_object->shadow_severed) {
2406                                         vm_object_paging_end(object);
2407                                         vm_object_unlock(object);
2408                                         vm_map_unlock_read(map);
2409                                         if(pmap_map != map)
2410                                                 vm_map_unlock(pmap_map);
2411
2412                                         if (funnel_set) {
2413                                                 thread_funnel_set( curflock, TRUE);
2414                                                 funnel_set = FALSE;
2415                                         }
2416                                         cur_thread->interruptible = interruptible_state;
2417
2418                                         return VM_FAULT_MEMORY_ERROR;
2419                                 }
2420
2421                                 /*
2422                                  *      Zero fill fault.  Page gets
2423                                  *      filled in top object. Insert
2424                                  *      page, then drop any lower lock.
2425                                  *      Give up if no page.
2426                                  */
2427                                 if ((vm_page_free_target -
2428                                    ((vm_page_free_target-vm_page_free_min)>>2))
2429                                                 > vm_page_free_count) {
2430                                         break;
2431                                 }
2432                                 m = vm_page_alloc(object, offset);
2433                                 if (m == VM_PAGE_NULL) {
2434                                         break;
2435                                 }
2436                                 /*
2437                                  * This is a zero-fill or initial fill
2438                                  * page fault.  As such, we consider it
2439                                  * undefined with respect to instruction
2440                                  * execution.  i.e. it is the responsibility
2441                                  * of higher layers to call for an instruction
2442                                  * sync after changing the contents and before
2443                                  * sending a program into this area.  We
2444                                  * choose this approach for performance
2445                                  */
2446
2447                                 m->no_isync = FALSE;
2448
2449                                 if (cur_object != object)
2450                                         vm_object_unlock(cur_object);
2451
2452                                 vm_object_paging_begin(object);
2453                                 vm_object_unlock(object);
2454
2455                                 /*
2456                                  *      Now zero fill page and map it.
2457                                  *      the page is probably going to
2458                                  *      be written soon, so don't bother
2459                                  *      to clear the modified bit
2460                                  *
2461                                  *      NOTE: This code holds the map
2462                                  *      lock across the zero fill.
2463                                  */
2464
2465                                 if (!map->no_zero_fill) {
2466                                         vm_page_zero_fill(m);
2467                                         type_of_fault = DBG_ZERO_FILL_FAULT;
2468                                         VM_STAT(zero_fill_count++);
2469                                 }
2470                                 vm_page_lock_queues();
2471                                 VM_PAGE_QUEUES_REMOVE(m);
2472
2473                                 m->page_ticket = vm_page_ticket;
2474                                 vm_page_ticket_roll++;
2475                                 if(vm_page_ticket_roll ==
2476                                                 VM_PAGE_TICKETS_IN_ROLL) {
2477                                         vm_page_ticket_roll = 0;
2478                                         if(vm_page_ticket ==
2479                                                 VM_PAGE_TICKET_ROLL_IDS)
2480                                                 vm_page_ticket= 0;
2481                                         else
2482                                                 vm_page_ticket++;
2483                                 }
2484
2485                                 queue_enter(&vm_page_queue_inactive,
2486                                                         m, vm_page_t, pageq);
2487                                 m->inactive = TRUE;
2488                                 vm_page_inactive_count++;
2489                                 vm_page_unlock_queues();
2490                                 goto FastPmapEnter;
2491                         }
2492
2493                         /*
2494                          *      On to the next level
2495                          */
2496
2497                         cur_offset += cur_object->shadow_offset;
2498                         new_object = cur_object->shadow;
2499                         vm_object_lock(new_object);
2500                         if (cur_object != object)
2501                                 vm_object_unlock(cur_object);
2502                         cur_object = new_object;
2503
2504                         continue;
2505                 }
2506         }
2507
2508         /*
2509          *      Cleanup from fast fault failure.  Drop any object
2510          *      lock other than original and drop map lock.
2511          */
2512
2513         if (object != cur_object)
2514                 vm_object_unlock(cur_object);
2515         }
2516         vm_map_unlock_read(map);
2517         if(pmap_map != map)
2518                 vm_map_unlock(pmap_map);
2519
2520         /*
2521          *      Make a reference to this object to
2522          *      prevent its disposal while we are messing with
2523          *      it.  Once we have the reference, the map is free
2524          *      to be diddled.  Since objects reference their
2525          *      shadows (and copies), they will stay around as well.
2526          */
2527
2528         assert(object->ref_count > 0);
2529         object->ref_count++;
2530         vm_object_res_reference(object);
2531         vm_object_paging_begin(object);
2532
2533         XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
2534         kr = vm_fault_page(object, offset, fault_type,
2535                            (change_wiring && !wired),
2536                            interruptible,
2537                            lo_offset, hi_offset, behavior,
2538                            &prot, &result_page, &top_page,
2539                            &type_of_fault,
2540                            &error_code, map->no_zero_fill, FALSE, map, vaddr);
2541
2542         /*
2543          *      If we didn't succeed, lose the object reference immediately.
2544          */
2545
2546         if (kr != VM_FAULT_SUCCESS)
2547                 vm_object_deallocate(object);
2548
2549         /*
2550          *      See why we failed, and take corrective action.
2551          */
2552
2553         switch (kr) {
2554                 case VM_FAULT_SUCCESS:
2555                         break;
2556                 case VM_FAULT_MEMORY_SHORTAGE:
2557                         if (vm_page_wait((change_wiring) ?
2558                                          THREAD_UNINT :
2559                                          THREAD_ABORTSAFE))
2560                                 goto RetryFault;
2561                         /* fall thru */
2562                 case VM_FAULT_INTERRUPTED:
2563                         kr = KERN_ABORTED;
2564                         goto done;
2565                 case VM_FAULT_RETRY:
2566                         goto RetryFault;
2567                 case VM_FAULT_FICTITIOUS_SHORTAGE:
2568                         vm_page_more_fictitious();
2569                         goto RetryFault;
2570                 case VM_FAULT_MEMORY_ERROR:
2571                         if (error_code)
2572                                 kr = error_code;
2573                         else
2574                                 kr = KERN_MEMORY_ERROR;
2575                         goto done;
2576         }
2577
2578         m = result_page;
2579
2580         if(m != VM_PAGE_NULL) {
2581                 assert((change_wiring && !wired) ?
2582                     (top_page == VM_PAGE_NULL) :
2583                     ((top_page == VM_PAGE_NULL) == (m->object == object)));
2584         }
2585
2586         /*
2587          *      How to clean up the result of vm_fault_page.  This
2588          *      happens whether the mapping is entered or not.
2589          */
2590
2591 #define UNLOCK_AND_DEALLOCATE                           \
2592         MACRO_BEGIN                                     \
2593         vm_fault_cleanup(m->object, top_page);          \
2594         vm_object_deallocate(object);                   \
2595         MACRO_END
2596
2597         /*
2598          *      What to do with the resulting page from vm_fault_page
2599          *      if it doesn't get entered into the physical map:
2600          */
2601
2602 #define RELEASE_PAGE(m)                                 \
2603         MACRO_BEGIN                                     \
2604         PAGE_WAKEUP_DONE(m);                            \
2605         vm_page_lock_queues();                          \
2606         if (!m->active && !m->inactive)                 \
2607                 vm_page_activate(m);                    \
2608         vm_page_unlock_queues();                        \
2609         MACRO_END
2610
2611         /*
2612          *      We must verify that the maps have not changed
2613          *      since our last lookup.
2614          */
2615
2616         if(m != VM_PAGE_NULL) {
2617                 old_copy_object = m->object->copy;
2618
2619                 vm_object_unlock(m->object);
2620         } else {
2621                 old_copy_object = VM_OBJECT_NULL;
2622         }
2623         if ((map != original_map) || !vm_map_verify(map, &version)) {
2624                 vm_object_t             retry_object;
2625                 vm_object_offset_t      retry_offset;
2626                 vm_prot_t               retry_prot;
2627
2628                 /*
2629                  *      To avoid trying to write_lock the map while another
2630                  *      thread has it read_locked (in vm_map_pageable), we
2631                  *      do not try for write permission.  If the page is
2632                  *      still writable, we will get write permission.  If it
2633                  *      is not, or has been marked needs_copy, we enter the
2634                  *      mapping without write permission, and will merely
2635                  *      take another fault.
2636                  */
2637                 map = original_map;
2638                 vm_map_lock_read(map);
2639                 kr = vm_map_lookup_locked(&map, vaddr,
2640                                    fault_type & ~VM_PROT_WRITE, &version,
2641                                    &retry_object, &retry_offset, &retry_prot,
2642                                    &wired, &behavior, &lo_offset, &hi_offset,
2643                                    &pmap_map);
2644                 pmap = pmap_map->pmap;
2645
2646                 if (kr != KERN_SUCCESS) {
2647                         vm_map_unlock_read(map);
2648                         if(m != VM_PAGE_NULL) {
2649                                 vm_object_lock(m->object);
2650                                 RELEASE_PAGE(m);
2651                                 UNLOCK_AND_DEALLOCATE;
2652                         } else {
2653                                 vm_object_deallocate(object);
2654                         }
2655                         goto done;
2656                 }
2657
2658                 vm_object_unlock(retry_object);
2659                 if(m != VM_PAGE_NULL) {
2660                         vm_object_lock(m->object);
2661                 } else {
2662                         vm_object_lock(object);
2663                 }
2664
2665                 if ((retry_object != object) ||
2666                     (retry_offset != offset)) {
2667                         vm_map_unlock_read(map);
2668                         if(pmap_map != map)
2669                                 vm_map_unlock(pmap_map);
2670                         if(m != VM_PAGE_NULL) {
2671                                 RELEASE_PAGE(m);
2672                                 UNLOCK_AND_DEALLOCATE;
2673                         } else {
2674                                 vm_object_deallocate(object);
2675                         }
2676                         goto RetryFault;
2677                 }
2678
2679                 /*
2680                  *      Check whether the protection has changed or the object
2681                  *      has been copied while we left the map unlocked.
2682                  */
2683                 prot &= retry_prot;
2684                 if(m != VM_PAGE_NULL) {
2685                         vm_object_unlock(m->object);
2686                 } else {
2687                         vm_object_unlock(object);
2688                 }
2689         }
2690         if(m != VM_PAGE_NULL) {
2691                 vm_object_lock(m->object);
2692         } else {
2693                 vm_object_lock(object);
2694         }
2695
2696         /*
2697          *      If the copy object changed while the top-level object
2698          *      was unlocked, then we must take away write permission.
2699          */
2700
2701         if(m != VM_PAGE_NULL) {
2702                 if (m->object->copy != old_copy_object)
2703                         prot &= ~VM_PROT_WRITE;
2704         }
2705
2706         /*
2707          *      If we want to wire down this page, but no longer have
2708          *      adequate permissions, we must start all over.
2709          */
2710
2711         if (wired && (fault_type != (prot|VM_PROT_WRITE))) {
2712                 vm_map_verify_done(map, &version);
2713                 if(pmap_map != map)
2714                         vm_map_unlock(pmap_map);
2715                 if(m != VM_PAGE_NULL) {
2716                         RELEASE_PAGE(m);
2717                         UNLOCK_AND_DEALLOCATE;
2718                 } else {
2719                         vm_object_deallocate(object);
2720                 }
2721                 goto RetryFault;
2722         }
2723
2724         /*
2725          *      Put this page into the physical map.
2726          *      We had to do the unlock above because pmap_enter
2727          *      may cause other faults.  The page may be on
2728          *      the pageout queues.  If the pageout daemon comes
2729          *      across the page, it will remove it from the queues.
2730          */
2731         if (m != VM_PAGE_NULL) {
2732                 if (m->no_isync == TRUE) {
2733                         pmap_sync_caches_phys(m->phys_addr);
2734
2735                         m->no_isync = FALSE;
2736                 }
2737                 vm_object_unlock(m->object);
2738
2739                 PMAP_ENTER(pmap, vaddr, m, prot, wired);
2740                 {
2741                         tws_hash_line_t line;
2742                         task_t          task;
2743
2744                            task = current_task();
2745                            if((map != NULL) &&
2746                                 (task->dynamic_working_set != 0)) {
2747                                 if(tws_lookup
2748                                         ((tws_hash_t)
2749                                         task->dynamic_working_set,
2750                                         m->offset, m->object,
2751                                         &line) != KERN_SUCCESS) {
2752                                         tws_insert((tws_hash_t)
2753                                            task->dynamic_working_set,
2754                                            m->offset, m->object,
2755                                            vaddr, pmap_map);
2756                                         if(tws_insert((tws_hash_t)
2757                                                    task->dynamic_working_set,
2758                                                    m->offset, m->object,
2759                                                    vaddr, pmap_map)
2760                                                                 == KERN_NO_SPACE) {
2761                                                 tws_expand_working_set(
2762                                                         task->dynamic_working_set,
2763                                                         TWS_HASH_LINE_COUNT);
2764                                         }
2765                                 }
2766                         }
2767                 }
2768         } else {
2769
2770 /*  if __ppc__  not working until figure out phys copy on block maps */
2771 #ifdef notdefcdy
2772                 int     memattr;
2773                 struct  phys_entry      *pp;
2774                 /*
2775                  * do a pmap block mapping from the physical address
2776                  * in the object
2777                  */
2778                 if(pp = pmap_find_physentry(
2779                         (vm_offset_t)object->shadow_offset)) {
2780                         memattr = ((pp->pte1 & 0x00000078) >> 3);
2781                 } else {
2782                         memattr = PTE_WIMG_UNCACHED_COHERENT_GUARDED;
2783                 }
2784
2785                 pmap_map_block(pmap, vaddr,
2786                         (vm_offset_t)object->shadow_offset,
2787                         object->size, prot,
2788                         memattr, 0); /* Set up a block mapped area */
2789 //#else
2790                 vm_offset_t     off;
2791                 for (off = 0; off < object->size; off += page_size) {
2792                         pmap_enter(pmap, vaddr + off,
2793                                 object->shadow_offset + off, prot, TRUE);
2794                         /* Map it in */
2795                 }
2796 #endif
2797
2798         }
2799
2800         /*
2801          *      If the page is not wired down and isn't already
2802          *      on a pageout queue, then put it where the
2803          *      pageout daemon can find it.
2804          */
2805         if(m != VM_PAGE_NULL) {
2806                 vm_object_lock(m->object);
2807                 vm_page_lock_queues();
2808
2809                 if (change_wiring) {
2810                         if (wired)
2811                                 vm_page_wire(m);
2812                         else
2813                                 vm_page_unwire(m);
2814                 }
2815 #if     VM_FAULT_STATIC_CONFIG
2816                 else {
2817                         if (!m->active && !m->inactive)
2818                                 vm_page_activate(m);
2819                         m->reference = TRUE;
2820                 }
2821 #else
2822                 else if (software_reference_bits) {
2823                         if (!m->active && !m->inactive)
2824                                 vm_page_activate(m);
2825                         m->reference = TRUE;
2826                 } else {
2827                         vm_page_activate(m);
2828                 }
2829 #endif
2830                 vm_page_unlock_queues();
2831         }
2832
2833         /*
2834          *      Unlock everything, and return
2835          */
2836
2837         vm_map_verify_done(map, &version);
2838         if(pmap_map != map)
2839                 vm_map_unlock(pmap_map);
2840         if(m != VM_PAGE_NULL) {
2841                 PAGE_WAKEUP_DONE(m);
2842                 UNLOCK_AND_DEALLOCATE;
2843         } else {
2844                 vm_fault_cleanup(object, top_page);
2845                 vm_object_deallocate(object);
2846         }
2847         kr = KERN_SUCCESS;
2848
2849 #undef  UNLOCK_AND_DEALLOCATE
2850 #undef  RELEASE_PAGE
2851
2852     done:
2853         if (funnel_set) {
2854                 thread_funnel_set( curflock, TRUE);
2855                 funnel_set = FALSE;
2856         }
2857         cur_thread->interruptible = interruptible_state;
2858
2859         KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 0)) | DBG_FUNC_END,
2860                               vaddr,
2861                               type_of_fault,
2862                               kr,
2863                               0,
2864                               0);
2865         return(kr);
2866 }
2867
2868 /*
2869  *      vm_fault_wire:
2870  *
2871  *      Wire down the range of virtual addresses covered by "entry" in a map, one page at a time.
2872  *      Returns KERN_SUCCESS, or the first failing code after unwiring the pages handled so far. */
2873 kern_return_t
2874 vm_fault_wire(
2875         vm_map_t        map,
2876         vm_map_entry_t  entry,
2877         pmap_t          pmap)
2878 {
2879         /* end_addr is captured from entry->vme_end once, before any faulting is done. */
2880         register vm_offset_t    va;
2881         register vm_offset_t    end_addr = entry->vme_end;
2882         register kern_return_t  rc;
2883
2884         assert(entry->in_transition);
2885
2886         /*
2887          *      Inform the physical mapping system that the
2888          *      range of addresses may not fault, so that
2889          *      page tables and such can be locked down as well.
2890          */
2891
2892         pmap_pageable(pmap, entry->vme_start, end_addr, FALSE);
2893
2894         /*
2895          *      We simulate a fault to get the page and enter it
2896          *      in the physical map.  The stripped-down
2897          *      vm_fault_wire_fast() is tried first; vm_fault() is the fallback. */
2898
2899         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
2900                 if ((rc = vm_fault_wire_fast(
2901                                 map, va, entry, pmap)) != KERN_SUCCESS) {
2902                         rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
2903                                       (pmap == kernel_pmap) ? THREAD_UNINT : THREAD_ABORTSAFE);
2904                 }
2905                 /* The fallback above is uninterruptible (THREAD_UNINT) only when wiring in kernel_pmap. */
2906                 if (rc != KERN_SUCCESS) {
2907                         struct vm_map_entry     tmp_entry = *entry;
2908
2909                         /* unwire wired pages: the copied entry is truncated to end at va, i.e. just before the page that failed */
2910                         tmp_entry.vme_end = va;
2911                         vm_fault_unwire(map, &tmp_entry, FALSE, pmap);
2912
2913                         return rc;
2914                 }
2915         }
2916         return KERN_SUCCESS;
2917 }
2918
2919 /*
2920  *      vm_fault_unwire:
2921  *
2922  *      Unwire a range of virtual addresses in a map: each page of
2923  *      [entry->vme_start, entry->vme_end) is unwired or, when "deallocate" is TRUE, released with VM_PAGE_FREE. */
2924 void
2925 vm_fault_unwire(
2926         vm_map_t        map,
2927         vm_map_entry_t  entry,
2928         boolean_t       deallocate,
2929         pmap_t          pmap)
2930 {
2931         register vm_offset_t    va;
2932         register vm_offset_t    end_addr = entry->vme_end;
2933         vm_object_t             object;
2934         /* VM_OBJECT_NULL selects the vm_fault() path in the loop below; it is always used for sub-map entries. */
2935         object = (entry->is_sub_map)
2936                         ? VM_OBJECT_NULL : entry->object.vm_object;
2937
2938         /*
2939          *      Since the pages are wired down, we must be able to
2940          *      get their mappings from the physical map system.
2941          */
2942
2943         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
2944                 pmap_change_wiring(pmap, va, FALSE);
2945
2946                 if (object == VM_OBJECT_NULL) {
2947                         (void) vm_fault(map, va, VM_PROT_NONE, TRUE, THREAD_UNINT);
2948                 } else {
2949                         vm_prot_t       prot;
2950                         vm_page_t       result_page;
2951                         vm_page_t       top_page;
2952                         vm_object_t     result_object;
2953                         vm_fault_return_t result;
2954                         /* Look the page up again for as long as vm_fault_page() answers VM_FAULT_RETRY. */
2955                         do {
2956                                 prot = VM_PROT_NONE;
2957                                 /* Lock and paging reference are retaken on every pass; presumably vm_fault_page() drops them on retry - confirm there. */
2958                                 vm_object_lock(object);
2959                                 vm_object_paging_begin(object);
2960                                 XPR(XPR_VM_FAULT,
2961                                         "vm_fault_unwire -> vm_fault_page\n",
2962                                         0,0,0,0,0);
2963                                 result = vm_fault_page(object,
2964                                                 entry->offset +
2965                                                   (va - entry->vme_start),
2966                                                 VM_PROT_NONE, TRUE,
2967                                                 THREAD_UNINT,
2968                                                 entry->offset,
2969                                                 entry->offset +
2970                                                        (entry->vme_end
2971                                                         - entry->vme_start),
2972                                                 entry->behavior,
2973                                                 &prot,
2974                                                 &result_page,
2975                                                 &top_page,
2976                                                 (int *)0,
2977                                                 0, map->no_zero_fill,
2978                                                 FALSE, NULL, 0);
2979                         } while (result == VM_FAULT_RETRY);
2980                         /* Any result other than VM_FAULT_SUCCESS is treated as fatal. */
2981                         if (result != VM_FAULT_SUCCESS)
2982                                 panic("vm_fault_unwire: failure");
2983                         /* Save the page's object now: result_page may be freed below, and the object is still needed for cleanup. */
2984                         result_object = result_page->object;
2985                         if (deallocate) {
2986                                 assert(!result_page->fictitious);
2987                                 pmap_page_protect(result_page->phys_addr,
2988                                                 VM_PROT_NONE);
2989                                 VM_PAGE_FREE(result_page);
2990                         } else {
2991                                 vm_page_lock_queues();
2992                                 vm_page_unwire(result_page);
2993                                 vm_page_unlock_queues();
2994                                 PAGE_WAKEUP_DONE(result_page);
2995                         }
2996
2997                         vm_fault_cleanup(result_object, top_page);
2998                 }
2999         }
3000
3001         /*
3002          *      Inform the physical mapping system that the range
3003          *      of addresses may fault, so that page tables and
3004          *      such may be unwired themselves.
3005          */
3006
3007         pmap_pageable(pmap, entry->vme_start, end_addr, TRUE);
3008
3009 }
3010
3011 /*
3012  *      vm_fault_wire_fast:
3013  *
3014  *      Handle common case of a wire down page fault at the given address.
3015  *      If successful, the page is inserted into the associated physical map.
3016  *      The map entry is passed in to avoid the overhead of a map lookup.
3017  *
3018  *      NOTE: the given address should be truncated to the
3019  *      proper page address.
3020  *
3021  *      KERN_SUCCESS is returned if the page fault is handled; otherwise,
3022  *      a standard error specifying why the fault is fatal is returned.
3023  *
3024  *      The map in question must be referenced, and remains so.
3025  *      Caller has a read lock on the map.
3026  *      NOTE(review): "map" is not referenced directly anywhere in this routine's body.
3027  *      This is a stripped version of vm_fault() for wiring pages.  Anything
3028  *      other than the common case will return KERN_FAILURE, and the caller
3029  *      is expected to call vm_fault().
3030  */
3031 kern_return_t
3032 vm_fault_wire_fast(
3033         vm_map_t        map,
3034         vm_offset_t     va,
3035         vm_map_entry_t  entry,
3036         pmap_t          pmap)
3037 {
3038         vm_object_t             object;
3039         vm_object_offset_t      offset;
3040         register vm_page_t      m;
3041         vm_prot_t               prot;
3042         thread_act_t            thr_act;
3043         /* Count the fault through VM_STAT and, when the current activation has a task, in that task's counter. */
3044         VM_STAT(faults++);
3045
3046         if((thr_act=current_act()) && (thr_act->task != TASK_NULL))
3047           thr_act->task->faults++;
3048
3049 /*
3050  *      Recovery actions
3051  */
3052 /* RELEASE_PAGE: signal PAGE_WAKEUP_DONE on the page and undo the vm_page_wire() done by this routine. */
3053 #undef  RELEASE_PAGE
3054 #define RELEASE_PAGE(m) {                               \
3055         PAGE_WAKEUP_DONE(m);                            \
3056         vm_page_lock_queues();                          \
3057         vm_page_unwire(m);                              \
3058         vm_page_unlock_queues();                        \
3059 }
3060 /* UNLOCK_THINGS: drop the paging_in_progress count taken below and unlock the object. */
3061 /* NOTE(review): decrements the count directly, unlike vm_fault_copy_dst_cleanup() which calls vm_object_paging_end() - confirm this is intended. */
3062 #undef  UNLOCK_THINGS
3063 #define UNLOCK_THINGS   {                               \
3064         object->paging_in_progress--;                   \
3065         vm_object_unlock(object);                       \
3066 }
3067 /* UNLOCK_AND_DEALLOCATE: UNLOCK_THINGS, then drop the object reference taken below. */
3068 #undef  UNLOCK_AND_DEALLOCATE
3069 #define UNLOCK_AND_DEALLOCATE   {                       \
3070         UNLOCK_THINGS;                                  \
3071         vm_object_deallocate(object);                   \
3072 }
3073 /*
3074  *      Give up and have caller do things the hard way.
3075  *      (UNLOCK_AND_DEALLOCATE, then return KERN_FAILURE; requires the object lock to be held.) */
3076
3077 #define GIVE_UP {                                       \
3078         UNLOCK_AND_DEALLOCATE;                          \
3079         return(KERN_FAILURE);                           \
3080 }
3081
3082
3083         /*
3084          *      If this entry is not directly to a vm_object, bail out.
3085          *      (Nothing is locked or referenced yet, so a plain return is used instead of GIVE_UP.) */
3086         if (entry->is_sub_map)
3087                 return(KERN_FAILURE);
3088
3089         /*
3090          *      Find the backing store object and offset into it.
3091          */
3092
3093         object = entry->object.vm_object;
3094         offset = (va - entry->vme_start) + entry->offset;
3095         prot = entry->protection;
3096
3097         /*
3098          *      Make a reference to this object to prevent its
3099          *      disposal while we are messing with it.
3100          */
3101
3102         vm_object_lock(object);
3103         assert(object->ref_count > 0);
3104         object->ref_count++;
3105         vm_object_res_reference(object);
3106         object->paging_in_progress++;
3107
3108         /*
3109          *      INVARIANTS (through entire routine):
3110          *
3111          *      1)      At all times, we must either have the object
3112          *              lock or a busy page in some object to prevent
3113          *              some other thread from trying to bring in
3114          *              the same page.
3115          *
3116          *      2)      Once we have a busy page, we must remove it from
3117          *              the pageout queues, so that the pageout daemon
3118          *              will not grab it away.
3119          *
3120          */
3121
3122         /*
3123          *      Look for page in top-level object.  If it's not there or
3124          *      there's something going on, give up.
3125          *      Only the top-level object is searched here; no shadow object is consulted. */
3126         m = vm_page_lookup(object, offset);
3127         if ((m == VM_PAGE_NULL) || (m->busy) ||
3128             (m->unusual && ( m->error || m->restart || m->absent ||
3129                                 prot & m->page_lock))) {
3130
3131                 GIVE_UP;
3132         }
3133
3134         /*
3135          *      Wire the page down now.  All bail outs beyond this
3136          *      point must unwire the page.
3137          */
3138
3139         vm_page_lock_queues();
3140         vm_page_wire(m);
3141         vm_page_unlock_queues();
3142
3143         /*
3144          *      Mark page busy for other threads.
3145          */
3146         assert(!m->busy);
3147         m->busy = TRUE;
3148         assert(!m->absent);
3149
3150         /*
3151          *      Give up if the page is being written and there's a copy object
3152          *      (RELEASE_PAGE first undoes the wiring done above). */
3153         if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
3154                 RELEASE_PAGE(m);
3155                 GIVE_UP;
3156         }
3157
3158         /*
3159          *      Put this page into the physical map.
3160          *      We have to unlock the object because pmap_enter
3161          *      may cause other faults.
3162          *      A page still flagged no_isync has its caches synced first and the flag cleared. */
3163         if (m->no_isync == TRUE) {
3164                 pmap_sync_caches_phys(m->phys_addr);
3165
3166                 m->no_isync = FALSE;
3167         }
3168         vm_object_unlock(object);
3169         /* The mapping is entered with the entry's protection and with the wired argument TRUE. */
3170         PMAP_ENTER(pmap, va, m, prot, TRUE);
3171
3172         /*
3173          *      Must relock object so that paging_in_progress can be cleared.
3174          */
3175         vm_object_lock(object);
3176
3177         /*
3178          *      Unlock everything, and return
3179          *      (the page stays wired on this success path; only PAGE_WAKEUP_DONE is signalled). */
3180
3181         PAGE_WAKEUP_DONE(m);
3182         UNLOCK_AND_DEALLOCATE;
3183
3184         return(KERN_SUCCESS);
3185
3186 }
3187
3188 /*
3189  *      Routine:        vm_fault_copy_cleanup
3190  *      Purpose:
3191  *              Release a page used by vm_fault_copy: signal PAGE_WAKEUP_DONE on it, activate it if it is
3192  *              on neither the active nor the inactive queue, then pass its object and top_page to vm_fault_cleanup. */
3193 /* "page" must not be VM_PAGE_NULL: it is dereferenced unconditionally. */
3194 void
3195 vm_fault_copy_cleanup(
3196         vm_page_t       page,
3197         vm_page_t       top_page)
3198 {
3199         vm_object_t     object = page->object;
3200         /* The lock taken here is not released in this routine; presumably vm_fault_cleanup() unlocks the object - confirm there. */
3201         vm_object_lock(object);
3202         PAGE_WAKEUP_DONE(page);
3203         vm_page_lock_queues();
3204         if (!page->active && !page->inactive)
3205                 vm_page_activate(page);
3206         vm_page_unlock_queues();
3207         vm_fault_cleanup(object, top_page);
3208 }
3209 /* vm_fault_copy_dst_cleanup: unwire "page" and end the paging operation on its object; VM_PAGE_NULL is a no-op. */
3210 void
3211 vm_fault_copy_dst_cleanup(
3212         vm_page_t       page)
3213 {
3214         vm_object_t     object;
3215         /* The object lock is held around both the unwire (done under the page-queues lock) and vm_object_paging_end(). */
3216         if (page != VM_PAGE_NULL) {
3217                 object = page->object;
3218                 vm_object_lock(object);
3219                 vm_page_lock_queues();
3220                 vm_page_unwire(page);
3221                 vm_page_unlock_queues();
3222                 vm_object_paging_end(object);
3223                 vm_object_unlock(object);
3224         }
3225 }
3226
3227 /*
3228  *      Routine:        vm_fault_copy
3229  *
3230  *      Purpose:
3231  *              Copy pages from one virtual memory object to another --
3232  *              neither the source nor destination pages need be resident.
3233  *
3234  *              Before actually copying a page, the version associated with
3235  *              the destination address map wil be verified.
3236  *
3237  *      In/out conditions:
3238  *              The caller must hold a reference, but not a lock, to
3239  *              each of the source and destination objects and to the
3240  *              destination map.
3241  *
3242  *      Results:
3243  *              Returns KERN_SUCCESS if no errors were encountered in
3244  *              reading or writing the data.  Returns KERN_INTERRUPTED if
3245  *              the operation was interrupted (only possible if the
3246  *              "interruptible" argument is asserted).  Other return values
3247  *              indicate a permanent error in copying the data.
3248  *
3249  *              The actual amount of data copied will be returned in the
3250  *              "copy_size" argument.  In the event that the destination map
3251  *              verification failed, this amount may be less than the amount
3252  *              requested.
3253  */
3254 kern_return_t
3255 vm_fault_copy(
3256         vm_object_t             src_object,
3257         vm_object_offset_t      src_offset,
3258         vm_size_t               *src_size,              /* INOUT */
3259         vm_object_t             dst_object,
3260         vm_object_offset_t      dst_offset,
3261         vm_map_t                dst_map,
3262         vm_map_version_t         *dst_version,
3263         int                     interruptible)
3264 {
3265         vm_page_t               result_page;
3266
3267         vm_page_t               src_page;
3268         vm_page_t               src_top_page;
3269         vm_prot_t               src_prot;
3270
3271         vm_page_t               dst_page;
3272         vm_page_t               dst_top_page;
3273         vm_prot_t               dst_prot;
3274
3275         vm_size_t               amount_left;
3276         vm_object_t             old_copy_object;
3277         kern_return_t           error = 0;
3278
3279         vm_size_t               part_size;
3280
3281         /*
3282          * In order not to confuse the clustered pageins, align
3283          * the different offsets on a page boundary.
3284          */
3285         vm_object_offset_t      src_lo_offset = trunc_page_64(src_offset);
3286         vm_object_offset_t      dst_lo_offset = trunc_page_64(dst_offset);
3287         vm_object_offset_t      src_hi_offset = round_page_64(src_offset + *src_size);
3288         vm_object_offset_t      dst_hi_offset = round_page_64(dst_offset + *src_size);
3289
3290 #define RETURN(x)                                       \
3291         MACRO_BEGIN                                     \
3292         *src_size -= amount_left;                       \
3293         MACRO_RETURN(x);                                \
3294         MACRO_END
3295
3296         amount_left = *src_size;
3297         do { /* while (amount_left > 0) */
3298                 /*
3299                  * There may be a deadlock if both source and destination
3300                  * pages are the same. To avoid this deadlock, the copy must
3301                  * start by getting the destination page in order to apply
3302                  * COW semantics if any.
3303                  */
3304
3305         RetryDestinationFault: ;
3306
3307                 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
3308
3309                 vm_object_lock(dst_object);
3310                 vm_object_paging_begin(dst_object);
3311
3312                 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
3313                 switch (vm_fault_page(dst_object,
3314                                       trunc_page_64(dst_offset),
3315                                       VM_PROT_WRITE|VM_PROT_READ,
3316                                       FALSE,
3317                                       interruptible,
3318                                       dst_lo_offset,
3319                                       dst_hi_offset,
3320                                       VM_BEHAVIOR_SEQUENTIAL,
3321                                       &dst_prot,
3322                                       &dst_page,
3323                                       &dst_top_page,
3324                                       (int *)0,
3325                                       &error,
3326                                       dst_map->no_zero_fill,
3327                                       FALSE, NULL, 0)) {
3328                 case VM_FAULT_SUCCESS:
3329                         break;
3330                 case VM_FAULT_RETRY:
3331                         goto RetryDestinationFault;
3332                 case VM_FAULT_MEMORY_SHORTAGE:
3333                         if (vm_page_wait(interruptible))
3334                                 goto RetryDestinationFault;
3335                         /* fall thru */
3336                 case VM_FAULT_INTERRUPTED:
3337                         RETURN(MACH_SEND_INTERRUPTED);
3338                 case VM_FAULT_FICTITIOUS_SHORTAGE:
3339                         vm_page_more_fictitious();
3340                         goto RetryDestinationFault;
3341                 case VM_FAULT_MEMORY_ERROR:
3342                         if (error)
3343                                 return (error);
3344                         else
3345                                 return(KERN_MEMORY_ERROR);
3346                 }
3347                 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
3348
3349                 old_copy_object = dst_page->object->copy;
3350
3351                 /*
3352                  * There exists the possiblity that the source and
3353                  * destination page are the same.  But we can't
3354                  * easily determine that now.  If they are the
3355                  * same, the call to vm_fault_page() for the
3356                  * destination page will deadlock.  To prevent this we
3357                  * wire the page so we can drop busy without having
3358                  * the page daemon steal the page.  We clean up the
3359                  * top page  but keep the paging reference on the object
3360                  * holding the dest page so it doesn't go away.
3361                  */
3362
3363                 vm_page_lock_queues();
3364                 vm_page_wire(dst_page);
3365                 vm_page_unlock_queues();
3366                 PAGE_WAKEUP_DONE(dst_page);
3367                 vm_object_unlock(dst_page->object);
3368
3369                 if (dst_top_page != VM_PAGE_NULL) {
3370                         vm_object_lock(dst_object);
3371                         VM_PAGE_FREE(dst_top_page);
3372                         vm_object_paging_end(dst_object);
3373                         vm_object_unlock(dst_object);
3374                 }
3375
3376         RetrySourceFault: ;
3377
3378                 if (src_object == VM_OBJECT_NULL) {
3379                         /*
3380                          *      No source object.  We will just
3381                          *      zero-fill the page in dst_object.
3382                          */
3383                         src_page = VM_PAGE_NULL;
3384                         result_page = VM_PAGE_NULL;
3385                 } else {
3386                         vm_object_lock(src_object);
3387                         src_page = vm_page_lookup(src_object,
3388                                                   trunc_page_64(src_offset));
3389                         if (src_page == dst_page) {
3390                                 src_prot = dst_prot;
3391                                 result_page = VM_PAGE_NULL;
3392                         } else {
3393                                 src_prot = VM_PROT_READ;
3394                                 vm_object_paging_begin(src_object);
3395
3396                                 XPR(XPR_VM_FAULT,
3397                                         "vm_fault_copy(2) -> vm_fault_page\n",
3398                                         0,0,0,0,0);
3399                                 switch (vm_fault_page(src_object,
3400                                                       trunc_page_64(src_offset),
3401                                                       VM_PROT_READ,
3402                                                       FALSE,
3403                                                       interruptible,
3404                                                       src_lo_offset,
3405                                                       src_hi_offset,
3406                                                       VM_BEHAVIOR_SEQUENTIAL,
3407                                                       &src_prot,
3408                                                       &result_page,
3409                                                       &src_top_page,
3410                                                       (int *)0,
3411                                                       &error,
3412                                                       FALSE,
3413                                                       FALSE, NULL, 0)) {
3414
3415                                 case VM_FAULT_SUCCESS:
3416                                         break;
3417                                 case VM_FAULT_RETRY:
3418                                         goto RetrySourceFault;
3419                                 case VM_FAULT_MEMORY_SHORTAGE:
3420                                         if (vm_page_wait(interruptible))
3421                                                 goto RetrySourceFault;
3422                                         /* fall thru */
3423                                 case VM_FAULT_INTERRUPTED:
3424                                         vm_fault_copy_dst_cleanup(dst_page);
3425                                         RETURN(MACH_SEND_INTERRUPTED);
3426                                 case VM_FAULT_FICTITIOUS_SHORTAGE:
3427                                         vm_page_more_fictitious();
3428                                         goto RetrySourceFault;
3429                                 case VM_FAULT_MEMORY_ERROR:
3430                                         vm_fault_copy_dst_cleanup(dst_page);
3431                                         if (error)
3432                                                 return (error);
3433                                         else
3434                                                 return(KERN_MEMORY_ERROR);
3435                                 }
3436
3437
3438                                 assert((src_top_page == VM_PAGE_NULL) ==
3439                                        (result_page->object == src_object));
3440                         }
3441                         assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
3442                         vm_object_unlock(result_page->object);
3443                 }
3444
3445                 if (!vm_map_verify(dst_map, dst_version)) {
3446                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
3447                                 vm_fault_copy_cleanup(result_page, src_top_page);
3448                         vm_fault_copy_dst_cleanup(dst_page);
3449                         break;
3450                 }
3451
3452                 vm_object_lock(dst_page->object);
3453
3454                 if (dst_page->object->copy != old_copy_object) {
3455                         vm_object_unlock(dst_page->object);
3456                         vm_map_verify_done(dst_map, dst_version);
3457                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
3458                                 vm_fault_copy_cleanup(result_page, src_top_page);
3459                         vm_fault_copy_dst_cleanup(dst_page);
3460                         break;
3461                 }
3462                 vm_object_unlock(dst_page->object);
3463
3464                 /*
3465                  *      Copy the page, and note that it is dirty
3466                  *      immediately.
3467                  */
3468
3469                 if (!page_aligned(src_offset) ||
3470                         !page_aligned(dst_offset) ||
3471                         !page_aligned(amount_left)) {
3472
3473                         vm_object_offset_t      src_po,
3474                                                 dst_po;
3475
3476                         src_po = src_offset - trunc_page_64(src_offset);
3477                         dst_po = dst_offset - trunc_page_64(dst_offset);
3478
3479                         if (dst_po > src_po) {
3480                                 part_size = PAGE_SIZE - dst_po;
3481                         } else {
3482                                 part_size = PAGE_SIZE - src_po;
3483                         }
3484                         if (part_size > (amount_left)){
3485                                 part_size = amount_left;
3486                         }
3487
3488                         if (result_page == VM_PAGE_NULL) {
3489                                 vm_page_part_zero_fill(dst_page,
3490                                                         dst_po, part_size);
3491                         } else {
3492                                 vm_page_part_copy(result_page, src_po,
3493                                         dst_page, dst_po, part_size);
3494                                 if(!dst_page->dirty){
3495                                         vm_object_lock(dst_object);
3496                                         dst_page->dirty = TRUE;
3497                                         vm_object_unlock(dst_page->object);
3498                                 }
3499
3500                         }
3501                 } else {
3502                         part_size = PAGE_SIZE;
3503
3504                         if (result_page == VM_PAGE_NULL)
3505                                 vm_page_zero_fill(dst_page);
3506                         else{
3507                                 vm_page_copy(result_page, dst_page);
3508                                 if(!dst_page->dirty){
3509                                         vm_object_lock(dst_object);
3510                                         dst_page->dirty = TRUE;
3511                                         vm_object_unlock(dst_page->object);
3512                                 }
3513                         }
3514
3515                 }
3516
3517                 /*
3518                  *      Unlock everything, and return
3519                  */
3520
3521                 vm_map_verify_done(dst_map, dst_version);
3522
3523                 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3524                         vm_fault_copy_cleanup(result_page, src_top_page);
3525                 vm_fault_copy_dst_cleanup(dst_page);
3526
3527                 amount_left -= part_size;
3528                 src_offset += part_size;
3529                 dst_offset += part_size;
3530         } while (amount_left > 0);
3531
3532         RETURN(KERN_SUCCESS);
3533 #undef  RETURN
3534
3535         /*NOTREACHED*/
3536 }
3537
3538 #ifdef  notdef
3539
3540 /*
3541  *      Routine:        vm_fault_page_overwrite
3542  *
3543  *      Description:
3544  *              A form of vm_fault_page that assumes that the
3545  *              resulting page will be overwritten in its entirety,
3546  *              making it unnecessary to obtain the correct *contents*
3547  *              of the page.
3548  *
3549  *      Implementation:
3550  *              XXX Untested.  Also unused.  Eventually, this technology
3551  *              could be used in vm_fault_copy() to advantage.
      *
      *      Parameters:
      *              dst_object, dst_offset  identify the page wanted.
      *              result_page             (OUT) set to the page on success.
      *
      *      Returns:
      *              VM_FAULT_SUCCESS        *result_page has been set.
      *              VM_FAULT_INTERRUPTED    a wait did not end with
      *                                      THREAD_AWAKENED, or the unlock
      *                                      request returned
      *                                      MACH_SEND_INTERRUPTED.
      *              VM_FAULT_MEMORY_ERROR   the unlock request failed for
      *                                      any other reason.
      *
      *      Locking:
      *              NOTE(review): the routine looks up the page before
      *              taking any lock and unlocks dst_object before each
      *              wait, so it appears to expect dst_object locked on
      *              entry; no unlock is performed on the success path.
      *              Confirm against the intended caller.
3552  */
3553 vm_fault_return_t
3554 vm_fault_page_overwrite(
3555         register
3556         vm_object_t             dst_object,
3557         vm_object_offset_t      dst_offset,
3558         vm_page_t               *result_page)   /* OUT */
3559 {
3560         register
3561         vm_page_t       dst_page;
3562         kern_return_t   wait_result;
3563
             /* Every wait in this routine uses this fixed value; undefined again at the end. */
3564 #define interruptible   THREAD_UNINT    /* XXX */
3565
3566         while (TRUE) {
3567                 /*
3568                  *      Look for a page at this offset
3569                  */
3570
3571                 while ((dst_page = vm_page_lookup(dst_object, dst_offset))
3572                                  == VM_PAGE_NULL) {
3573                         /*
3574                          *      No page, no problem... just allocate one.
3575                          */
3576
3577                         dst_page = vm_page_alloc(dst_object, dst_offset);
3578                         if (dst_page == VM_PAGE_NULL) {
                                     /*
                                      *      Allocation failed: drop the object lock
                                      *      around VM_PAGE_WAIT, retake it, and go
                                      *      back to the lookup.
                                      */
3579                                 vm_object_unlock(dst_object);
3580                                 VM_PAGE_WAIT();
3581                                 vm_object_lock(dst_object);
3582                                 continue;
3583                         }
3584
3585                         /*
3586                          *      Pretend that the memory manager
3587                          *      write-protected the page.
3588                          *
3589                          *      Note that we will be asking for write
3590                          *      permission without asking for the data
3591                          *      first.
3592                          */
3593
3594                         dst_page->overwriting = TRUE;
3595                         dst_page->page_lock = VM_PROT_WRITE;
3596                         dst_page->absent = TRUE;
3597                         dst_page->unusual = TRUE;
3598                         dst_object->absent_count++;
3599
3600                         break;
3601
3602                         /*
3603                          *      When we bail out, we might have to throw
3604                          *      away the page created here.
3605                          */
3606
                             /*
                              *      DISCARD_PAGE is only a macro definition, so its
                              *      position after the break does not affect control
                              *      flow.  It takes and releases the object lock
                              *      itself and frees the page at dst_offset only if
                              *      that page is still marked "overwriting".  Every
                              *      use below follows a vm_object_unlock.
                              */
3607 #define DISCARD_PAGE                                            \
3608         MACRO_BEGIN                                             \
3609         vm_object_lock(dst_object);                             \
3610         dst_page = vm_page_lookup(dst_object, dst_offset);      \
3611         if ((dst_page != VM_PAGE_NULL) && dst_page->overwriting) \
3612                 VM_PAGE_FREE(dst_page);                         \
3613         vm_object_unlock(dst_object);                           \
3614         MACRO_END
3615                 }
3616
3617                 /*
3618                  *      If the page is write-protected...
3619                  */
3620
3621                 if (dst_page->page_lock & VM_PROT_WRITE) {
3622                         /*
3623                          *      ... and an unlock request hasn't been sent
3624                          */
3625
3626                         if ( ! (dst_page->unlock_request & VM_PROT_WRITE)) {
3627                                 vm_prot_t       u;
3628                                 kern_return_t   rc;
3629
3630                                 /*
3631                                  *      ... then send one now.
3632                                  */
3633
3634                                 if (!dst_object->pager_ready) {
                                             /*
                                              *      Pager not ready yet: wait for the
                                              *      PAGER_READY event before sending
                                              *      anything.
                                              */
3635                                         vm_object_assert_wait(dst_object,
3636                                                 VM_OBJECT_EVENT_PAGER_READY,
3637                                                 interruptible);
3638                                         vm_object_unlock(dst_object);
3639                                         wait_result = thread_block((void (*)(void))0);
3640                                         if (wait_result != THREAD_AWAKENED) {
3641                                                 DISCARD_PAGE;
3642                                                 return(VM_FAULT_INTERRUPTED);
3643                                         }
                                             /*
                                              *      NOTE(review): dst_object was unlocked
                                              *      above and is not relocked before this
                                              *      continue re-enters the page lookup --
                                              *      confirm whether a vm_object_lock is
                                              *      missing here.
                                              */
3644                                         continue;
3645                                 }
3646
                                     /*
                                      *      Record the write-unlock request on the page
                                      *      and keep the resulting request mask for the
                                      *      pager call made after the lock is dropped.
                                      */
3647                                 u = dst_page->unlock_request |= VM_PROT_WRITE;
3648                                 vm_object_unlock(dst_object);
3649
3650                                 if ((rc = memory_object_data_unlock(
3651                                                 dst_object->pager,
3652                                                 dst_offset + dst_object->paging_offset,
3653                                                 PAGE_SIZE,
3654                                                 u)) != KERN_SUCCESS) {
3655                                         if (vm_fault_debug)
3656                                             printf("vm_object_overwrite: memory_object_data_unlock failed\n");
3657                                         DISCARD_PAGE;
3658                                         return((rc == MACH_SEND_INTERRUPTED) ?
3659                                                 VM_FAULT_INTERRUPTED :
3660                                                 VM_FAULT_MEMORY_ERROR);
3661                                 }
                                     /* Request sent: retake the lock and re-examine the page. */
3662                                 vm_object_lock(dst_object);
3663                                 continue;
3664                         }
3665
3666                         /* ... fall through to wait below */
3667                 } else {
3668                         /*
3669                          *      If the page isn't being used for other
3670                          *      purposes, then we're done.
3671                          */
3672                         if ( ! (dst_page->busy || dst_page->absent ||
3673                                 dst_page->error || dst_page->restart) )
3674                                 break;
3675                 }
3676
                     /*
                      *      Either an unlock request is already outstanding or the
                      *      page is in use: wait on the page, then loop.
                      *
                      *      NOTE(review): the object is unlocked before blocking and
                      *      is not relocked before the loop returns to the page
                      *      lookup -- confirm whether a vm_object_lock is missing.
                      */
3677                 PAGE_ASSERT_WAIT(dst_page, interruptible);
3678                 vm_object_unlock(dst_object);
3679                 wait_result = thread_block((void (*)(void))0);
3680                 if (wait_result != THREAD_AWAKENED) {
3681                         DISCARD_PAGE;
3682                         return(VM_FAULT_INTERRUPTED);
3683                 }
3684         }
3685
3686         *result_page = dst_page;
3687         return(VM_FAULT_SUCCESS);
3688
3689 #undef  interruptible
3690 #undef  DISCARD_PAGE
3691 }
3692
3693 #endif  /* notdef */
3694
3695 #if     VM_FAULT_CLASSIFY
3696 /*
3697  *      Temporary statistics gathering support.
3698  */
3699
3700 /*
3701  *      Statistics arrays:
      *
      *      vm_fault_stats[type][level] is a counter that
      *      vm_fault_classify increments.  "type" is one of the
      *      VM_FAULT_TYPE_* values below; "level" is the number of
      *      shadow links vm_fault_classify followed before it could
      *      classify the fault.  Valid indices are
      *      0 .. VM_FAULT_TYPES_MAX-1 and 0 .. VM_FAULT_LEVEL_MAX-1.
3702  */
3703 #define VM_FAULT_TYPES_MAX      5
3704 #define VM_FAULT_LEVEL_MAX      8
3705
3706 int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
3707
     /*
      *      Fault categories, as assigned by vm_fault_classify:
      *
      *      ZERO_FILL   no page found, no pager created, no shadow object
      *      MAP_IN      page found; read fault, or a write fault at
      *                  level 0 on an object with no copy object
      *      PAGER       no page found and the object has pager_created set
      *      COPY        page found; any other write fault
      *      OTHER       page found but busy, in error, restart or absent,
      *                  or the fault type conflicts with its page_lock
      */
3708 #define VM_FAULT_TYPE_ZERO_FILL 0
3709 #define VM_FAULT_TYPE_MAP_IN    1
3710 #define VM_FAULT_TYPE_PAGER     2
3711 #define VM_FAULT_TYPE_COPY      3
3712 #define VM_FAULT_TYPE_OTHER     4
3713
3714
3715 void
3716 vm_fault_classify(vm_object_t           object,
3717                   vm_object_offset_t    offset,
3718                   vm_prot_t             fault_type)
3719 {
3720         int             type, level = 0;
3721         vm_page_t       m;
3722
3723         while (TRUE) {
3724                 m = vm_page_lookup(object, offset);
3725                 if (m != VM_PAGE_NULL) {
3726                         if (m->busy || m->error || m->restart || m->absent ||
3727                             fault_type & m->page_lock) {
3728                                 type = VM_FAULT_TYPE_OTHER;
3729                                 break;
3730                         }
3731                         if (((fault_type & VM_PROT_WRITE) == 0) ||
3732                             ((level == 0) && object->copy == VM_OBJECT_NULL)) {
3733                                 type = VM_FAULT_TYPE_MAP_IN;
3734                                 break;
3735                         }
3736                         type = VM_FAULT_TYPE_COPY;
3737                         break;
3738                 }
3739                 else {
3740                         if (object->pager_created) {
3741                                 type = VM_FAULT_TYPE_PAGER;
3742                                 break;
3743                         }
3744                         if (object->shadow == VM_OBJECT_NULL) {
3745                                 type = VM_FAULT_TYPE_ZERO_FILL;
3746                                 break;
3747                         }
3748
3749                         offset += object->shadow_offset;
3750                         object = object->shadow;
3751                         level++;
3752                         continue;
3753                 }
3754         }
3755
3756         if (level > VM_FAULT_LEVEL_MAX)
3757                 level = VM_FAULT_LEVEL_MAX;
3758
3759         vm_fault_stats[type][level] += 1;
3760
3761         return;
3762 }
3763
3764 /* cleanup routine to call from debugger */
3765
3766 void
3767 vm_fault_classify_init(void)
3768 {
3769         int type, level;
3770
3771         for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
3772                 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
3773                         vm_fault_stats[type][level] = 0;
3774                 }
3775         }
3776
3777         return;
3778 }
3779 #endif  /* VM_FAULT_CLASSIFY */