1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 /*
31 * @OSF_COPYRIGHT@
32 */
33 /*
34 * Mach Operating System
35 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
36 * All Rights Reserved.
37 *
38 * Permission to use, copy, modify and distribute this software and its
39 * documentation is hereby granted, provided that both the copyright
40 * notice and this permission notice appear in all copies of the
41 * software, derivative works or modified versions, and any portions
42 * thereof, and that both notices appear in supporting documentation.
43 *
44 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
45 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
46 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
47 *
48 * Carnegie Mellon requests users of this software to return to
49 *
50 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
51 * School of Computer Science
52 * Carnegie Mellon University
53 * Pittsburgh PA 15213-3890
54 *
55 * any improvements or extensions that they make and grant Carnegie Mellon
56 * the rights to redistribute these changes.
57 */
58 /*
59 */
60 /*
61 * File: vm/vm_object.c
62 * Author: Avadis Tevanian, Jr., Michael Wayne Young
63 *
64 * Virtual memory object module.
65 */
66
67 #include <mach_pagemap.h>
68 #include <task_swapper.h>
69
70 #include <mach/mach_types.h>
71 #include <mach/memory_object.h>
72 #include <mach/memory_object_default.h>
73 #include <mach/memory_object_control_server.h>
74 #include <mach/vm_param.h>
75
76 #include <ipc/ipc_types.h>
77 #include <ipc/ipc_port.h>
78
79 #include <kern/kern_types.h>
80 #include <kern/assert.h>
81 #include <kern/lock.h>
82 #include <kern/queue.h>
83 #include <kern/xpr.h>
84 #include <kern/zalloc.h>
85 #include <kern/host.h>
86 #include <kern/host_statistics.h>
87 #include <kern/processor.h>
88 #include <kern/misc_protos.h>
89
90 #include <vm/memory_object.h>
91 #include <vm/vm_fault.h>
92 #include <vm/vm_map.h>
93 #include <vm/vm_object.h>
94 #include <vm/vm_page.h>
95 #include <vm/vm_pageout.h>
96 #include <vm/vm_protos.h>
97
98 /*
99 * Virtual memory objects maintain the actual data
100 * associated with allocated virtual memory. A given
101 * page of memory exists within exactly one object.
102 *
103 * An object is only deallocated when all "references"
104 * are given up.
105 *
106 * Associated with each object is a list of all resident
107 * memory pages belonging to that object; this list is
108 * maintained by the "vm_page" module, but locked by the object's
109 * lock.
110 *
111 * Each object also records the memory object reference
112 * that is used by the kernel to request and write
113 * back data (the memory object, field "pager"), etc...
114 *
115 * Virtual memory objects are allocated to provide
116 * zero-filled memory (vm_allocate) or map a user-defined
117 * memory object into a virtual address space (vm_map).
118 *
119 * Virtual memory objects that refer to a user-defined
120 * memory object are called "permanent", because all changes
121 * made in virtual memory are reflected back to the
122  * memory manager, which may then store them permanently.
123 * Other virtual memory objects are called "temporary",
124 * meaning that changes need be written back only when
125 * necessary to reclaim pages, and that storage associated
126 * with the object can be discarded once it is no longer
127 * mapped.
128 *
129 * A permanent memory object may be mapped into more
130 * than one virtual address space. Moreover, two threads
131 * may attempt to make the first mapping of a memory
132 * object concurrently. Only one thread is allowed to
133  * complete this mapping; all others wait until the
134 * "pager_initialized" field is asserted, indicating
135 * that the first thread has initialized all of the
136 * necessary fields in the virtual memory object structure.
137 *
138 * The kernel relies on a *default memory manager* to
139 * provide backing storage for the zero-filled virtual
140 * memory objects. The pager memory objects associated
141 * with these temporary virtual memory objects are only
142 * requested from the default memory manager when it
143 * becomes necessary. Virtual memory objects
144 * that depend on the default memory manager are called
145 * "internal". The "pager_created" field is provided to
146 * indicate whether these ports have ever been allocated.
147 *
148 * The kernel may also create virtual memory objects to
149 * hold changed pages after a copy-on-write operation.
150 * In this case, the virtual memory object (and its
151 * backing storage -- its memory object) only contain
152 * those pages that have been changed. The "shadow"
153 * field refers to the virtual memory object that contains
154 * the remainder of the contents. The "shadow_offset"
155 * field indicates where in the "shadow" these contents begin.
156 * The "copy" field refers to a virtual memory object
157 * to which changed pages must be copied before changing
158 * this object, in order to implement another form
159 * of copy-on-write optimization.
160 *
161 * The virtual memory object structure also records
162 * the attributes associated with its memory object.
163 * The "pager_ready", "can_persist" and "copy_strategy"
164 * fields represent those attributes. The "cached_list"
165 * field is used in the implementation of the persistence
166 * attribute.
167 *
168 * ZZZ Continue this comment.
169 */
170
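/*
 * A minimal, purely illustrative sketch of the reference rules described
 * above (a hypothetical caller, not one of the call sites in this file):
 * every vm_object_allocate() or vm_object_reference() must eventually be
 * matched by a vm_object_deallocate().
 */
#if 0	/* illustrative sketch only */
static void
vm_object_lifecycle_example(void)
{
	vm_object_t	object;

	/* Allocate a temporary, internal object one page in size. */
	object = vm_object_allocate((vm_object_size_t) PAGE_SIZE);
	if (object == VM_OBJECT_NULL)
		return;				/* zone exhausted */

	/* Take an extra reference, as a second mapping would. */
	vm_object_reference(object);

	vm_object_deallocate(object);		/* drop the extra reference */
	vm_object_deallocate(object);		/* drop the allocation reference;
						 * storage may now be reclaimed */
}
#endif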
171 /* Forward declarations for internal functions. */
172 static kern_return_t vm_object_terminate(
173 vm_object_t object);
174
175 extern void vm_object_remove(
176 vm_object_t object);
177
178 static vm_object_t vm_object_cache_trim(
179 boolean_t called_from_vm_object_deallocate);
180
181 static void vm_object_deactivate_all_pages(
182 vm_object_t object);
183
184 static kern_return_t vm_object_copy_call(
185 vm_object_t src_object,
186 vm_object_offset_t src_offset,
187 vm_object_size_t size,
188 vm_object_t *_result_object);
189
190 static void vm_object_do_collapse(
191 vm_object_t object,
192 vm_object_t backing_object);
193
194 static void vm_object_do_bypass(
195 vm_object_t object,
196 vm_object_t backing_object);
197
198 static void vm_object_release_pager(
199 memory_object_t pager);
200
201 static zone_t vm_object_zone; /* vm backing store zone */
202
203 /*
204 * All wired-down kernel memory belongs to a single virtual
205 * memory object (kernel_object) to avoid wasting data structures.
206 */
207 static struct vm_object kernel_object_store;
208 vm_object_t kernel_object;
209
210 /*
211 * The submap object is used as a placeholder for vm_map_submap
212 * operations. The object is declared in vm_map.c because it
213 * is exported by the vm_map module. The storage is declared
214 * here because it must be initialized here.
215 */
216 static struct vm_object vm_submap_object_store;
217
218 /*
219 * Virtual memory objects are initialized from
220 * a template (see vm_object_allocate).
221 *
222 * When adding a new field to the virtual memory
223 * object structure, be sure to add initialization
224 * (see _vm_object_allocate()).
225 */
226 static struct vm_object vm_object_template;
227
228 /*
229 * Virtual memory objects that are not referenced by
230 * any address maps, but that are allowed to persist
231 * (an attribute specified by the associated memory manager),
232 * are kept in a queue (vm_object_cached_list).
233 *
234 * When an object from this queue is referenced again,
235 * for example to make another address space mapping,
236 * it must be removed from the queue. That is, the
237 * queue contains *only* objects with zero references.
238 *
239 * The kernel may choose to terminate objects from this
240 * queue in order to reclaim storage. The current policy
241 * is to permit a fixed maximum number of unreferenced
242 * objects (vm_object_cached_max).
243 *
244  * A mutex (accessed by routines
245 * vm_object_cache_{lock,lock_try,unlock}) governs the
246 * object cache. It must be held when objects are
247 * added to or removed from the cache (in vm_object_terminate).
248 * The routines that acquire a reference to a virtual
249 * memory object based on one of the memory object ports
250 * must also lock the cache.
251 *
252 * Ideally, the object cache should be more isolated
253 * from the reference mechanism, so that the lock need
254 * not be held to make simple references.
255 */
256 static queue_head_t vm_object_cached_list;
257 static int vm_object_cached_count=0;
258 static int vm_object_cached_high; /* highest # cached objects */
259 static int vm_object_cached_max = 512; /* may be patched */
260
261 static decl_mutex_data(,vm_object_cached_lock_data)
262
263 #define vm_object_cache_lock() \
264 mutex_lock(&vm_object_cached_lock_data)
265 #define vm_object_cache_lock_try() \
266 mutex_try(&vm_object_cached_lock_data)
267 #define vm_object_cache_unlock() \
268 mutex_unlock(&vm_object_cached_lock_data)
269
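/*
 * A minimal sketch (hypothetical helper, not used elsewhere in this file)
 * of the locking rule stated above: the cache lock must bracket every
 * update of vm_object_cached_list / vm_object_cached_count, exactly as
 * vm_object_deallocate() and vm_object_cache_trim() do below.
 */
#if 0	/* illustrative sketch only */
static void
vm_object_cache_insert_example(
	vm_object_t	object)
{
	vm_object_cache_lock();
	queue_enter(&vm_object_cached_list, object,
		    vm_object_t, cached_list);
	vm_object_cached_count++;
	vm_object_cache_unlock();
}
#endif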
270 #define VM_OBJECT_HASH_COUNT 1024
271 static queue_head_t vm_object_hashtable[VM_OBJECT_HASH_COUNT];
272 static struct zone *vm_object_hash_zone;
273
274 struct vm_object_hash_entry {
275 queue_chain_t hash_link; /* hash chain link */
276 memory_object_t pager; /* pager we represent */
277 vm_object_t object; /* corresponding object */
278 boolean_t waiting; /* someone waiting for
279 * termination */
280 };
281
282 typedef struct vm_object_hash_entry *vm_object_hash_entry_t;
283 #define VM_OBJECT_HASH_ENTRY_NULL ((vm_object_hash_entry_t) 0)
284
285 #define VM_OBJECT_HASH_SHIFT 8
286 #define vm_object_hash(pager) \
287 ((((unsigned)pager) >> VM_OBJECT_HASH_SHIFT) % VM_OBJECT_HASH_COUNT)
288
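/*
 * Worked example of the hash above (illustrative address only): a pager at
 * 0x12345000 maps to ((0x12345000 >> 8) % 1024) = 0x123450 % 1024 = 80,
 * so its entry is chained on vm_object_hashtable[80].
 */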
289 void vm_object_hash_entry_free(
290 vm_object_hash_entry_t entry);
291
292 /*
293 * vm_object_hash_lookup looks up a pager in the hashtable
294 * and returns the corresponding entry, with optional removal.
295 */
296
297 static vm_object_hash_entry_t
298 vm_object_hash_lookup(
299 memory_object_t pager,
300 boolean_t remove_entry)
301 {
302 register queue_t bucket;
303 register vm_object_hash_entry_t entry;
304
305 bucket = &vm_object_hashtable[vm_object_hash(pager)];
306
307 entry = (vm_object_hash_entry_t)queue_first(bucket);
308 while (!queue_end(bucket, (queue_entry_t)entry)) {
309 if (entry->pager == pager && !remove_entry)
310 return(entry);
311 else if (entry->pager == pager) {
312 queue_remove(bucket, entry,
313 vm_object_hash_entry_t, hash_link);
314 return(entry);
315 }
316
317 entry = (vm_object_hash_entry_t)queue_next(&entry->hash_link);
318 }
319
320 return(VM_OBJECT_HASH_ENTRY_NULL);
321 }
322
323 /*
324  * vm_object_hash_insert enters the specified
325  * pager / cache object association in the hashtable.
326 */
327
328 static void
329 vm_object_hash_insert(
330 vm_object_hash_entry_t entry)
331 {
332 register queue_t bucket;
333
334 bucket = &vm_object_hashtable[vm_object_hash(entry->pager)];
335
336 queue_enter(bucket, entry, vm_object_hash_entry_t, hash_link);
337 }
338
339 static vm_object_hash_entry_t
340 vm_object_hash_entry_alloc(
341 memory_object_t pager)
342 {
343 vm_object_hash_entry_t entry;
344
345 entry = (vm_object_hash_entry_t)zalloc(vm_object_hash_zone);
346 entry->pager = pager;
347 entry->object = VM_OBJECT_NULL;
348 entry->waiting = FALSE;
349
350 return(entry);
351 }
352
353 void
354 vm_object_hash_entry_free(
355 vm_object_hash_entry_t entry)
356 {
357 zfree(vm_object_hash_zone, entry);
358 }
359
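/*
 * A minimal sketch (hypothetical helper) of how the routines above fit
 * together when a pager/object association is recorded and later torn
 * down; per the cache-lock rule, the hash table is only touched with the
 * cache lock held.
 */
#if 0	/* illustrative sketch only */
static void
vm_object_hash_example(
	memory_object_t	pager,
	vm_object_t	object)
{
	vm_object_hash_entry_t	entry;

	/* Record the association. */
	entry = vm_object_hash_entry_alloc(pager);
	entry->object = object;
	vm_object_cache_lock();
	vm_object_hash_insert(entry);
	vm_object_cache_unlock();

	/* ... later: look it up again, removing it from its bucket ... */
	vm_object_cache_lock();
	entry = vm_object_hash_lookup(pager, TRUE);
	vm_object_cache_unlock();
	if (entry != VM_OBJECT_HASH_ENTRY_NULL)
		vm_object_hash_entry_free(entry);
}
#endif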
360 /*
361 * vm_object_allocate:
362 *
363 * Returns a new object with the given size.
364 */
365
366 __private_extern__ void
367 _vm_object_allocate(
368 vm_object_size_t size,
369 vm_object_t object)
370 {
371 XPR(XPR_VM_OBJECT,
372 "vm_object_allocate, object 0x%X size 0x%X\n",
373 (integer_t)object, size, 0,0,0);
374
375 *object = vm_object_template;
376 queue_init(&object->memq);
377 queue_init(&object->msr_q);
378 #ifdef UPL_DEBUG
379 queue_init(&object->uplq);
380 #endif /* UPL_DEBUG */
381 vm_object_lock_init(object);
382 object->size = size;
383 }
384
385 __private_extern__ vm_object_t
386 vm_object_allocate(
387 vm_object_size_t size)
388 {
389 register vm_object_t object;
390
391 object = (vm_object_t) zalloc(vm_object_zone);
392
393 // dbgLog(object, size, 0, 2); /* (TEST/DEBUG) */
394
395 if (object != VM_OBJECT_NULL)
396 _vm_object_allocate(size, object);
397
398 return object;
399 }
400
401 /*
402 * vm_object_bootstrap:
403 *
404 * Initialize the VM objects module.
405 */
406 __private_extern__ void
407 vm_object_bootstrap(void)
408 {
409 register int i;
410
411 vm_object_zone = zinit((vm_size_t) sizeof(struct vm_object),
412 round_page_32(512*1024),
413 round_page_32(12*1024),
414 "vm objects");
415
416 queue_init(&vm_object_cached_list);
417 mutex_init(&vm_object_cached_lock_data, 0);
418
419 vm_object_hash_zone =
420 zinit((vm_size_t) sizeof (struct vm_object_hash_entry),
421 round_page_32(512*1024),
422 round_page_32(12*1024),
423 "vm object hash entries");
424
425 for (i = 0; i < VM_OBJECT_HASH_COUNT; i++)
426 queue_init(&vm_object_hashtable[i]);
427
428 /*
429 * Fill in a template object, for quick initialization
430 */
431
432 /* memq; Lock; init after allocation */
433 vm_object_template.size = 0;
434 vm_object_template.memq_hint = VM_PAGE_NULL;
435 vm_object_template.ref_count = 1;
436 #if TASK_SWAPPER
437 vm_object_template.res_count = 1;
438 #endif /* TASK_SWAPPER */
439 vm_object_template.resident_page_count = 0;
440 vm_object_template.copy = VM_OBJECT_NULL;
441 vm_object_template.shadow = VM_OBJECT_NULL;
442 vm_object_template.shadow_offset = (vm_object_offset_t) 0;
443 vm_object_template.cow_hint = ~(vm_offset_t)0;
444 vm_object_template.true_share = FALSE;
445
446 vm_object_template.pager = MEMORY_OBJECT_NULL;
447 vm_object_template.paging_offset = 0;
448 vm_object_template.pager_control = MEMORY_OBJECT_CONTROL_NULL;
449 /* msr_q; init after allocation */
450
451 vm_object_template.copy_strategy = MEMORY_OBJECT_COPY_SYMMETRIC;
452 vm_object_template.absent_count = 0;
453 vm_object_template.paging_in_progress = 0;
454
455 /* Begin bitfields */
456 vm_object_template.all_wanted = 0; /* all bits FALSE */
457 vm_object_template.pager_created = FALSE;
458 vm_object_template.pager_initialized = FALSE;
459 vm_object_template.pager_ready = FALSE;
460 vm_object_template.pager_trusted = FALSE;
461 vm_object_template.can_persist = FALSE;
462 vm_object_template.internal = TRUE;
463 vm_object_template.temporary = TRUE;
464 vm_object_template.private = FALSE;
465 vm_object_template.pageout = FALSE;
466 vm_object_template.alive = TRUE;
467 vm_object_template.purgable = VM_OBJECT_NONPURGABLE;
468 vm_object_template.silent_overwrite = FALSE;
469 vm_object_template.advisory_pageout = FALSE;
470 vm_object_template.shadowed = FALSE;
471 vm_object_template.terminating = FALSE;
472 vm_object_template.shadow_severed = FALSE;
473 vm_object_template.phys_contiguous = FALSE;
474 vm_object_template.nophyscache = FALSE;
475 /* End bitfields */
476
477 /* cache bitfields */
478 vm_object_template.wimg_bits = VM_WIMG_DEFAULT;
479
480 /* cached_list; init after allocation */
481 vm_object_template.last_alloc = (vm_object_offset_t) 0;
482 vm_object_template.cluster_size = 0;
483 #if MACH_PAGEMAP
484 vm_object_template.existence_map = VM_EXTERNAL_NULL;
485 #endif /* MACH_PAGEMAP */
486 #if MACH_ASSERT
487 vm_object_template.paging_object = VM_OBJECT_NULL;
488 #endif /* MACH_ASSERT */
489
490 /*
491 * Initialize the "kernel object"
492 */
493
494 kernel_object = &kernel_object_store;
495
496 /*
497 * Note that in the following size specifications, we need to add 1 because
498 * VM_MAX_KERNEL_ADDRESS (vm_last_addr) is a maximum address, not a size.
499 */
500
501 #ifdef ppc
502 _vm_object_allocate((vm_last_addr - VM_MIN_KERNEL_ADDRESS) + 1,
503 kernel_object);
504 #else
505 _vm_object_allocate((VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) + 1,
506 kernel_object);
507 #endif
508 kernel_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
509
510 /*
511 * Initialize the "submap object". Make it as large as the
512 * kernel object so that no limit is imposed on submap sizes.
513 */
514
515 vm_submap_object = &vm_submap_object_store;
516 #ifdef ppc
517 _vm_object_allocate((vm_last_addr - VM_MIN_KERNEL_ADDRESS) + 1,
518 vm_submap_object);
519 #else
520 _vm_object_allocate((VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) + 1,
521 vm_submap_object);
522 #endif
523 vm_submap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
524
525 /*
526 * Create an "extra" reference to this object so that we never
527 * try to deallocate it; zfree doesn't like to be called with
528 * non-zone memory.
529 */
530 vm_object_reference(vm_submap_object);
531
532 #if MACH_PAGEMAP
533 vm_external_module_initialize();
534 #endif /* MACH_PAGEMAP */
535 }
536
537 __private_extern__ void
538 vm_object_init(void)
539 {
540 /*
541 * Finish initializing the kernel object.
542 */
543 }
544
545 /* remove the typedef below when emergency work-around is taken out */
546 typedef struct vnode_pager {
547 memory_object_t pager;
548 memory_object_t pager_handle; /* pager */
549 memory_object_control_t control_handle; /* memory object's control handle */
550 void *vnode_handle; /* vnode handle */
551 } *vnode_pager_t;
552
553 #define MIGHT_NOT_CACHE_SHADOWS 1
554 #if MIGHT_NOT_CACHE_SHADOWS
555 static int cache_shadows = TRUE;
556 #endif /* MIGHT_NOT_CACHE_SHADOWS */
557
558 /*
559 * vm_object_deallocate:
560 *
561 * Release a reference to the specified object,
562 * gained either through a vm_object_allocate
563 * or a vm_object_reference call. When all references
564 * are gone, storage associated with this object
565 * may be relinquished.
566 *
567 * No object may be locked.
568 */
569 __private_extern__ void
570 vm_object_deallocate(
571 register vm_object_t object)
572 {
573 boolean_t retry_cache_trim = FALSE;
574 vm_object_t shadow = VM_OBJECT_NULL;
575
576 // if(object)dbgLog(object, object->ref_count, object->can_persist, 3); /* (TEST/DEBUG) */
577 // else dbgLog(object, 0, 0, 3); /* (TEST/DEBUG) */
578
579
580 while (object != VM_OBJECT_NULL) {
581
582 /*
583 * The cache holds a reference (uncounted) to
584 * the object; we must lock it before removing
585 * the object.
586 */
587 for (;;) {
588 vm_object_cache_lock();
589
590 /*
591 * if we try to take a regular lock here
592 * we risk deadlocking against someone
593 * holding a lock on this object while
594 * trying to vm_object_deallocate a different
595 * object
596 */
597 if (vm_object_lock_try(object))
598 break;
599 vm_object_cache_unlock();
600 mutex_pause(); /* wait a bit */
601 }
602 assert(object->ref_count > 0);
603
604 /*
605 * If the object has a named reference, and only
606 * that reference would remain, inform the pager
607 * about the last "mapping" reference going away.
608 */
609 if ((object->ref_count == 2) && (object->named)) {
610 memory_object_t pager = object->pager;
611
612 /* Notify the Pager that there are no */
613 /* more mappers for this object */
614
615 if (pager != MEMORY_OBJECT_NULL) {
616 vm_object_unlock(object);
617 vm_object_cache_unlock();
618
619 memory_object_unmap(pager);
620
621 for (;;) {
622 vm_object_cache_lock();
623
624 /*
625 * if we try to take a regular lock here
626 * we risk deadlocking against someone
627 * holding a lock on this object while
628 * trying to vm_object_deallocate a different
629 * object
630 */
631 if (vm_object_lock_try(object))
632 break;
633 vm_object_cache_unlock();
634 mutex_pause(); /* wait a bit */
635 }
636 assert(object->ref_count > 0);
637 }
638 }
639
640 /*
641 * Lose the reference. If other references
642 * remain, then we are done, unless we need
643 * to retry a cache trim.
644 * If it is the last reference, then keep it
645 * until any pending initialization is completed.
646 */
647
648 /* if the object is terminating, it cannot go into */
649 /* the cache and we obviously should not call */
650 /* terminate again. */
651
652 if ((object->ref_count > 1) || object->terminating) {
653 object->ref_count--;
654 vm_object_res_deallocate(object);
655 vm_object_cache_unlock();
656
657 if (object->ref_count == 1 &&
658 object->shadow != VM_OBJECT_NULL) {
659 /*
660 * There's only one reference left on this
661 * VM object. We can't tell if it's a valid
662 * one (from a mapping for example) or if this
663 * object is just part of a possibly stale and
664 * useless shadow chain.
665 * We would like to try and collapse it into
666 * its parent, but we don't have any pointers
667 * back to this parent object.
668 * But we can try and collapse this object with
669 * its own shadows, in case these are useless
670 * too...
671 * We can't bypass this object though, since we
672 * don't know if this last reference on it is
673 * meaningful or not.
674 */
675 vm_object_collapse(object, 0, FALSE);
676 }
677
678 vm_object_unlock(object);
679 if (retry_cache_trim &&
680 ((object = vm_object_cache_trim(TRUE)) !=
681 VM_OBJECT_NULL)) {
682 continue;
683 }
684 return;
685 }
686
687 /*
688 * We have to wait for initialization
689 * before destroying or caching the object.
690 */
691
692 if (object->pager_created && ! object->pager_initialized) {
693 assert(! object->can_persist);
694 vm_object_assert_wait(object,
695 VM_OBJECT_EVENT_INITIALIZED,
696 THREAD_UNINT);
697 vm_object_unlock(object);
698 vm_object_cache_unlock();
699 thread_block(THREAD_CONTINUE_NULL);
700 continue;
701 }
702
703 /*
704 * If this object can persist, then enter it in
705 * the cache. Otherwise, terminate it.
706 *
707 * NOTE: Only permanent objects are cached, and
708 * permanent objects cannot have shadows. This
709 * affects the residence counting logic in a minor
710 * way (can do it in-line, mostly).
711 */
712
713 if ((object->can_persist) && (object->alive)) {
714 /*
715 * Now it is safe to decrement reference count,
716 * and to return if reference count is > 0.
717 */
718 if (--object->ref_count > 0) {
719 vm_object_res_deallocate(object);
720 vm_object_unlock(object);
721 vm_object_cache_unlock();
722 if (retry_cache_trim &&
723 ((object = vm_object_cache_trim(TRUE)) !=
724 VM_OBJECT_NULL)) {
725 continue;
726 }
727 return;
728 }
729
730 #if MIGHT_NOT_CACHE_SHADOWS
731 /*
732 * Remove shadow now if we don't
733 * want to cache shadows.
734 */
735 if (! cache_shadows) {
736 shadow = object->shadow;
737 object->shadow = VM_OBJECT_NULL;
738 }
739 #endif /* MIGHT_NOT_CACHE_SHADOWS */
740
741 /*
742 * Enter the object onto the queue of
743 * cached objects, and deactivate
744 * all of its pages.
745 */
746 assert(object->shadow == VM_OBJECT_NULL);
747 VM_OBJ_RES_DECR(object);
748 XPR(XPR_VM_OBJECT,
749 "vm_o_deallocate: adding %x to cache, queue = (%x, %x)\n",
750 (integer_t)object,
751 (integer_t)vm_object_cached_list.next,
752 (integer_t)vm_object_cached_list.prev,0,0);
753
754 vm_object_cached_count++;
755 if (vm_object_cached_count > vm_object_cached_high)
756 vm_object_cached_high = vm_object_cached_count;
757 queue_enter(&vm_object_cached_list, object,
758 vm_object_t, cached_list);
759 vm_object_cache_unlock();
760 vm_object_deactivate_all_pages(object);
761 vm_object_unlock(object);
762
763 #if MIGHT_NOT_CACHE_SHADOWS
764 /*
765 * If we have a shadow that we need
766 * to deallocate, do so now, remembering
767 * to trim the cache later.
768 */
769 if (! cache_shadows && shadow != VM_OBJECT_NULL) {
770 object = shadow;
771 retry_cache_trim = TRUE;
772 continue;
773 }
774 #endif /* MIGHT_NOT_CACHE_SHADOWS */
775
776 /*
777 * Trim the cache. If the cache trim
778 * returns with a shadow for us to deallocate,
779 * then remember to retry the cache trim
780 * when we are done deallocating the shadow.
781 * Otherwise, we are done.
782 */
783
784 object = vm_object_cache_trim(TRUE);
785 if (object == VM_OBJECT_NULL) {
786 return;
787 }
788 retry_cache_trim = TRUE;
789
790 } else {
791 /*
792 * This object is not cachable; terminate it.
793 */
794 XPR(XPR_VM_OBJECT,
795 "vm_o_deallocate: !cacheable 0x%X res %d paging_ops %d thread 0x%p ref %d\n",
796 (integer_t)object, object->resident_page_count,
797 object->paging_in_progress,
798 (void *)current_thread(),object->ref_count);
799
800 VM_OBJ_RES_DECR(object); /* XXX ? */
801 /*
802 * Terminate this object. If it had a shadow,
803 * then deallocate it; otherwise, if we need
804 * to retry a cache trim, do so now; otherwise,
805 * we are done. "pageout" objects have a shadow,
806 * but maintain a "paging reference" rather than
807 * a normal reference.
808 */
809 shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
810 if(vm_object_terminate(object) != KERN_SUCCESS) {
811 return;
812 }
813 if (shadow != VM_OBJECT_NULL) {
814 object = shadow;
815 continue;
816 }
817 if (retry_cache_trim &&
818 ((object = vm_object_cache_trim(TRUE)) !=
819 VM_OBJECT_NULL)) {
820 continue;
821 }
822 return;
823 }
824 }
825 assert(! retry_cache_trim);
826 }
827
828 /*
829 * Check to see whether we really need to trim
830 * down the cache. If so, remove an object from
831 * the cache, terminate it, and repeat.
832 *
833 * Called with, and returns with, cache lock unlocked.
834 */
835 vm_object_t
836 vm_object_cache_trim(
837 boolean_t called_from_vm_object_deallocate)
838 {
839 register vm_object_t object = VM_OBJECT_NULL;
840 vm_object_t shadow;
841
842 for (;;) {
843
844 /*
845 * If we no longer need to trim the cache,
846 * then we are done.
847 */
848
849 vm_object_cache_lock();
850 if (vm_object_cached_count <= vm_object_cached_max) {
851 vm_object_cache_unlock();
852 return VM_OBJECT_NULL;
853 }
854
855 /*
856 * We must trim down the cache, so remove
857 * the first object in the cache.
858 */
859 XPR(XPR_VM_OBJECT,
860 "vm_object_cache_trim: removing from front of cache (%x, %x)\n",
861 (integer_t)vm_object_cached_list.next,
862 (integer_t)vm_object_cached_list.prev, 0, 0, 0);
863
864 object = (vm_object_t) queue_first(&vm_object_cached_list);
865 if(object == (vm_object_t) &vm_object_cached_list) {
866 /* something's wrong with the calling parameter or */
867 /* the value of vm_object_cached_count, just fix */
868 /* and return */
869 if(vm_object_cached_max < 0)
870 vm_object_cached_max = 0;
871 vm_object_cached_count = 0;
872 vm_object_cache_unlock();
873 return VM_OBJECT_NULL;
874 }
875 vm_object_lock(object);
876 queue_remove(&vm_object_cached_list, object, vm_object_t,
877 cached_list);
878 vm_object_cached_count--;
879
880 /*
881 * Since this object is in the cache, we know
882 * that it is initialized and has no references.
883 * Take a reference to avoid recursive deallocations.
884 */
885
886 assert(object->pager_initialized);
887 assert(object->ref_count == 0);
888 object->ref_count++;
889
890 /*
891 * Terminate the object.
892 * If the object had a shadow, we let vm_object_deallocate
893 * deallocate it. "pageout" objects have a shadow, but
894 * maintain a "paging reference" rather than a normal
895 * reference.
896 * (We are careful here to limit recursion.)
897 */
898 shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
899 if(vm_object_terminate(object) != KERN_SUCCESS)
900 continue;
901 if (shadow != VM_OBJECT_NULL) {
902 if (called_from_vm_object_deallocate) {
903 return shadow;
904 } else {
905 vm_object_deallocate(shadow);
906 }
907 }
908 }
909 }
910
911 boolean_t vm_object_terminate_remove_all = FALSE;
912
913 /*
914 * Routine: vm_object_terminate
915 * Purpose:
916 * Free all resources associated with a vm_object.
917 * In/out conditions:
918 * Upon entry, the object must be locked,
919 * and the object must have exactly one reference.
920 *
921 * The shadow object reference is left alone.
922 *
923  * The object must be unlocked if it's found that pages
924 * must be flushed to a backing object. If someone
925 * manages to map the object while it is being flushed
926 * the object is returned unlocked and unchanged. Otherwise,
927 * upon exit, the cache will be unlocked, and the
928 * object will cease to exist.
929 */
930 static kern_return_t
931 vm_object_terminate(
932 register vm_object_t object)
933 {
934 memory_object_t pager;
935 register vm_page_t p;
936 vm_object_t shadow_object;
937
938 XPR(XPR_VM_OBJECT, "vm_object_terminate, object 0x%X ref %d\n",
939 (integer_t)object, object->ref_count, 0, 0, 0);
940
941 if (!object->pageout && (!object->temporary || object->can_persist)
942 && (object->pager != NULL || object->shadow_severed)) {
943 vm_object_cache_unlock();
944 while (!queue_empty(&object->memq)) {
945 /*
946 * Clear pager_trusted bit so that the pages get yanked
947 * out of the object instead of cleaned in place. This
948 * prevents a deadlock in XMM and makes more sense anyway.
949 */
950 object->pager_trusted = FALSE;
951
952 p = (vm_page_t) queue_first(&object->memq);
953
954 VM_PAGE_CHECK(p);
955
956 if (p->busy || p->cleaning) {
957 if(p->cleaning || p->absent) {
958 vm_object_paging_wait(object, THREAD_UNINT);
959 continue;
960 } else {
961 panic("vm_object_terminate.3 0x%x 0x%x", object, p);
962 }
963 }
964
965 vm_page_lock_queues();
966 p->busy = TRUE;
967 VM_PAGE_QUEUES_REMOVE(p);
968 vm_page_unlock_queues();
969
970 if (p->absent || p->private) {
971
972 /*
973 * For private pages, VM_PAGE_FREE just
974 * leaves the page structure around for
975 * its owner to clean up. For absent
976 * pages, the structure is returned to
977 * the appropriate pool.
978 */
979
980 goto free_page;
981 }
982
983 if (p->fictitious)
984 panic("vm_object_terminate.4 0x%x 0x%x", object, p);
985
986 if (!p->dirty)
987 p->dirty = pmap_is_modified(p->phys_page);
988
989 if ((p->dirty || p->precious) && !p->error && object->alive) {
990 vm_pageout_cluster(p); /* flush page */
991 vm_object_paging_wait(object, THREAD_UNINT);
992 XPR(XPR_VM_OBJECT,
993 "vm_object_terminate restart, object 0x%X ref %d\n",
994 (integer_t)object, object->ref_count, 0, 0, 0);
995 } else {
996 free_page:
997 VM_PAGE_FREE(p);
998 }
999 }
1000 vm_object_unlock(object);
1001 vm_object_cache_lock();
1002 vm_object_lock(object);
1003 }
1004
1005 /*
1006 * Make sure the object isn't already being terminated
1007 */
1008 if(object->terminating) {
1009 object->ref_count -= 1;
1010 assert(object->ref_count > 0);
1011 vm_object_cache_unlock();
1012 vm_object_unlock(object);
1013 return KERN_FAILURE;
1014 }
1015
1016 /*
1017 * Did somebody get a reference to the object while we were
1018 * cleaning it?
1019 */
1020 if(object->ref_count != 1) {
1021 object->ref_count -= 1;
1022 assert(object->ref_count > 0);
1023 vm_object_res_deallocate(object);
1024 vm_object_cache_unlock();
1025 vm_object_unlock(object);
1026 return KERN_FAILURE;
1027 }
1028
1029 /*
1030 * Make sure no one can look us up now.
1031 */
1032
1033 object->terminating = TRUE;
1034 object->alive = FALSE;
1035 vm_object_remove(object);
1036
1037 /*
1038 * Detach the object from its shadow if we are the shadow's
1039 * copy. The reference we hold on the shadow must be dropped
1040 * by our caller.
1041 */
1042 if (((shadow_object = object->shadow) != VM_OBJECT_NULL) &&
1043 !(object->pageout)) {
1044 vm_object_lock(shadow_object);
1045 if (shadow_object->copy == object)
1046 shadow_object->copy = VM_OBJECT_NULL;
1047 vm_object_unlock(shadow_object);
1048 }
1049
1050 /*
1051 * The pageout daemon might be playing with our pages.
1052 * Now that the object is dead, it won't touch any more
1053 * pages, but some pages might already be on their way out.
1054 * Hence, we wait until the active paging activities have ceased
1055 * before we break the association with the pager itself.
1056 */
1057 while (object->paging_in_progress != 0) {
1058 vm_object_cache_unlock();
1059 vm_object_wait(object,
1060 VM_OBJECT_EVENT_PAGING_IN_PROGRESS,
1061 THREAD_UNINT);
1062 vm_object_cache_lock();
1063 vm_object_lock(object);
1064 }
1065
1066 pager = object->pager;
1067 object->pager = MEMORY_OBJECT_NULL;
1068
1069 if (pager != MEMORY_OBJECT_NULL)
1070 memory_object_control_disable(object->pager_control);
1071 vm_object_cache_unlock();
1072
1073 object->ref_count--;
1074 #if TASK_SWAPPER
1075 assert(object->res_count == 0);
1076 #endif /* TASK_SWAPPER */
1077
1078 assert (object->ref_count == 0);
1079
1080 /*
1081 * Clean or free the pages, as appropriate.
1082 * It is possible for us to find busy/absent pages,
1083 * if some faults on this object were aborted.
1084 */
1085 if (object->pageout) {
1086 assert(shadow_object != VM_OBJECT_NULL);
1087 assert(shadow_object == object->shadow);
1088
1089 vm_pageout_object_terminate(object);
1090
1091 } else if ((object->temporary && !object->can_persist) ||
1092 (pager == MEMORY_OBJECT_NULL)) {
1093 while (!queue_empty(&object->memq)) {
1094 p = (vm_page_t) queue_first(&object->memq);
1095
1096 VM_PAGE_CHECK(p);
1097 VM_PAGE_FREE(p);
1098 }
1099 } else if (!queue_empty(&object->memq)) {
1100 panic("vm_object_terminate: queue just emptied isn't");
1101 }
1102
1103 assert(object->paging_in_progress == 0);
1104 assert(object->ref_count == 0);
1105
1106 /*
1107 * If the pager has not already been released by
1108 * vm_object_destroy, we need to terminate it and
1109 * release our reference to it here.
1110 */
1111 if (pager != MEMORY_OBJECT_NULL) {
1112 vm_object_unlock(object);
1113 vm_object_release_pager(pager);
1114 vm_object_lock(object);
1115 }
1116
1117 /* kick off anyone waiting on terminating */
1118 object->terminating = FALSE;
1119 vm_object_paging_begin(object);
1120 vm_object_paging_end(object);
1121 vm_object_unlock(object);
1122
1123 #if MACH_PAGEMAP
1124 vm_external_destroy(object->existence_map, object->size);
1125 #endif /* MACH_PAGEMAP */
1126
1127 /*
1128 * Free the space for the object.
1129 */
1130 zfree(vm_object_zone, object);
1131 return KERN_SUCCESS;
1132 }
1133
1134 /*
1135 * Routine: vm_object_pager_wakeup
1136 * Purpose: Wake up anyone waiting for termination of a pager.
1137 */
1138
1139 static void
1140 vm_object_pager_wakeup(
1141 memory_object_t pager)
1142 {
1143 vm_object_hash_entry_t entry;
1144 boolean_t waiting = FALSE;
1145
1146 /*
1147 * If anyone was waiting for the memory_object_terminate
1148 * to be queued, wake them up now.
1149 */
1150 vm_object_cache_lock();
1151 entry = vm_object_hash_lookup(pager, TRUE);
1152 if (entry != VM_OBJECT_HASH_ENTRY_NULL)
1153 waiting = entry->waiting;
1154 vm_object_cache_unlock();
1155 if (entry != VM_OBJECT_HASH_ENTRY_NULL) {
1156 if (waiting)
1157 thread_wakeup((event_t) pager);
1158 vm_object_hash_entry_free(entry);
1159 }
1160 }
1161
1162 /*
1163 * Routine: vm_object_release_pager
1164 * Purpose: Terminate the pager and, upon completion,
1165 * release our last reference to it.
1166  * Just like memory_object_terminate, except
1167  * that we wake up anyone blocked in vm_object_enter
1168  * waiting for the termination message to be queued
1169 * before calling memory_object_init.
1170 */
1171 static void
1172 vm_object_release_pager(
1173 memory_object_t pager)
1174 {
1175
1176 /*
1177 * Terminate the pager.
1178 */
1179
1180 (void) memory_object_terminate(pager);
1181
1182 /*
1183 * Wakeup anyone waiting for this terminate
1184 */
1185 vm_object_pager_wakeup(pager);
1186
1187 /*
1188 * Release reference to pager.
1189 */
1190 memory_object_deallocate(pager);
1191 }
1192
1193 /*
1194 * Routine: vm_object_destroy
1195 * Purpose:
1196 * Shut down a VM object, despite the
1197 * presence of address map (or other) references
1198 * to the vm_object.
1199 */
1200 kern_return_t
1201 vm_object_destroy(
1202 vm_object_t object,
1203 __unused kern_return_t reason)
1204 {
1205 memory_object_t old_pager;
1206
1207 if (object == VM_OBJECT_NULL)
1208 return(KERN_SUCCESS);
1209
1210 /*
1211 * Remove the pager association immediately.
1212 *
1213 * This will prevent the memory manager from further
1214 * meddling. [If it wanted to flush data or make
1215 * other changes, it should have done so before performing
1216 * the destroy call.]
1217 */
1218
1219 vm_object_cache_lock();
1220 vm_object_lock(object);
1221 object->can_persist = FALSE;
1222 object->named = FALSE;
1223 object->alive = FALSE;
1224
1225 /*
1226 * Rip out the pager from the vm_object now...
1227 */
1228
1229 vm_object_remove(object);
1230 old_pager = object->pager;
1231 object->pager = MEMORY_OBJECT_NULL;
1232 if (old_pager != MEMORY_OBJECT_NULL)
1233 memory_object_control_disable(object->pager_control);
1234 vm_object_cache_unlock();
1235
1236 /*
1237 * Wait for the existing paging activity (that got
1238 * through before we nulled out the pager) to subside.
1239 */
1240
1241 vm_object_paging_wait(object, THREAD_UNINT);
1242 vm_object_unlock(object);
1243
1244 /*
1245 * Terminate the object now.
1246 */
1247 if (old_pager != MEMORY_OBJECT_NULL) {
1248 vm_object_release_pager(old_pager);
1249
1250 /*
1251 * JMM - Release the caller's reference. This assumes the
1252 * caller had a reference to release, which is a big (but
1253 * currently valid) assumption if this is driven from the
1254 * vnode pager (it is holding a named reference when making
1255 * this call)..
1256 */
1257 vm_object_deallocate(object);
1258
1259 }
1260 return(KERN_SUCCESS);
1261 }
1262
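/*
 * A sketch of the shutdown path just described (hypothetical caller): the
 * caller is assumed to hold a reference -- typically a named one from the
 * vnode pager -- which vm_object_destroy() releases on its behalf.
 */
#if 0	/* illustrative sketch only */
static void
vm_object_destroy_example(
	vm_object_t	object)
{
	kern_return_t	kr;

	kr = vm_object_destroy(object, KERN_SUCCESS);
	assert(kr == KERN_SUCCESS);	/* currently always succeeds */
}
#endif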
1263 /*
1264  * vm_object_deactivate_all_pages
1265 *
1266 * Deactivate all pages in the specified object. (Keep its pages
1267 * in memory even though it is no longer referenced.)
1268 *
1269 * The object must be locked.
1270 */
1271 static void
1272 vm_object_deactivate_all_pages(
1273 register vm_object_t object)
1274 {
1275 register vm_page_t p;
1276
1277 queue_iterate(&object->memq, p, vm_page_t, listq) {
1278 vm_page_lock_queues();
1279 if (!p->busy)
1280 vm_page_deactivate(p);
1281 vm_page_unlock_queues();
1282 }
1283 }
1284
1285 __private_extern__ void
1286 vm_object_deactivate_pages(
1287 vm_object_t object,
1288 vm_object_offset_t offset,
1289 vm_object_size_t size,
1290 boolean_t kill_page)
1291 {
1292 vm_object_t orig_object;
1293 int pages_moved = 0;
1294 int pages_found = 0;
1295
1296 /*
1297 * entered with object lock held, acquire a paging reference to
1298 * prevent the memory_object and control ports from
1299 * being destroyed.
1300 */
1301 orig_object = object;
1302
1303 for (;;) {
1304 register vm_page_t m;
1305 vm_object_offset_t toffset;
1306 vm_object_size_t tsize;
1307
1308 vm_object_paging_begin(object);
1309 vm_page_lock_queues();
1310
1311 for (tsize = size, toffset = offset; tsize; tsize -= PAGE_SIZE, toffset += PAGE_SIZE) {
1312
1313 if ((m = vm_page_lookup(object, toffset)) != VM_PAGE_NULL) {
1314
1315 pages_found++;
1316
1317 if ((m->wire_count == 0) && (!m->private) && (!m->gobbled) && (!m->busy)) {
1318
1319 assert(!m->laundry);
1320
1321 m->reference = FALSE;
1322 pmap_clear_reference(m->phys_page);
1323
1324 if ((kill_page) && (object->internal)) {
1325 m->precious = FALSE;
1326 m->dirty = FALSE;
1327 pmap_clear_modify(m->phys_page);
1328 vm_external_state_clr(object->existence_map, offset);
1329 }
1330 VM_PAGE_QUEUES_REMOVE(m);
1331
1332 assert(!m->laundry);
1333 assert(m->object != kernel_object);
1334 assert(m->pageq.next == NULL &&
1335 m->pageq.prev == NULL);
1336 if(m->zero_fill) {
1337 queue_enter_first(
1338 &vm_page_queue_zf,
1339 m, vm_page_t, pageq);
1340 } else {
1341 queue_enter_first(
1342 &vm_page_queue_inactive,
1343 m, vm_page_t, pageq);
1344 }
1345
1346 m->inactive = TRUE;
1347 if (!m->fictitious)
1348 vm_page_inactive_count++;
1349
1350 pages_moved++;
1351 }
1352 }
1353 }
1354 vm_page_unlock_queues();
1355 vm_object_paging_end(object);
1356
1357 if (object->shadow) {
1358 vm_object_t tmp_object;
1359
1360 kill_page = 0;
1361
1362 offset += object->shadow_offset;
1363
1364 tmp_object = object->shadow;
1365 vm_object_lock(tmp_object);
1366
1367 if (object != orig_object)
1368 vm_object_unlock(object);
1369 object = tmp_object;
1370 } else
1371 break;
1372 }
1373 if (object != orig_object)
1374 vm_object_unlock(object);
1375 }
1376
1377 /*
1378 * Routine: vm_object_pmap_protect
1379 *
1380 * Purpose:
1381 * Reduces the permission for all physical
1382 * pages in the specified object range.
1383 *
1384 * If removing write permission only, it is
1385 * sufficient to protect only the pages in
1386 * the top-level object; only those pages may
1387 * have write permission.
1388 *
1389 * If removing all access, we must follow the
1390 * shadow chain from the top-level object to
1391 * remove access to all pages in shadowed objects.
1392 *
1393 * The object must *not* be locked. The object must
1394 * be temporary/internal.
1395 *
1396 * If pmap is not NULL, this routine assumes that
1397 * the only mappings for the pages are in that
1398 * pmap.
1399 */
1400
1401 __private_extern__ void
1402 vm_object_pmap_protect(
1403 register vm_object_t object,
1404 register vm_object_offset_t offset,
1405 vm_object_size_t size,
1406 pmap_t pmap,
1407 vm_map_offset_t pmap_start,
1408 vm_prot_t prot)
1409 {
1410 if (object == VM_OBJECT_NULL)
1411 return;
1412 size = vm_object_round_page(size);
1413 offset = vm_object_trunc_page(offset);
1414
1415 vm_object_lock(object);
1416
1417 assert(object->internal);
1418
1419 while (TRUE) {
1420 if (ptoa_64(object->resident_page_count) > size/2 && pmap != PMAP_NULL) {
1421 vm_object_unlock(object);
1422 pmap_protect(pmap, pmap_start, pmap_start + size, prot);
1423 return;
1424 }
1425
1426 	/* if we are doing large ranges with respect to resident */
1427 	/* page count then we should iterate over pages; otherwise */
1428 	/* inverse page look-up will be faster */
1429 if (ptoa_64(object->resident_page_count / 4) < size) {
1430 vm_page_t p;
1431 vm_object_offset_t end;
1432
1433 end = offset + size;
1434
1435 if (pmap != PMAP_NULL) {
1436 queue_iterate(&object->memq, p, vm_page_t, listq) {
1437 if (!p->fictitious &&
1438 (offset <= p->offset) && (p->offset < end)) {
1439 vm_map_offset_t start;
1440
1441 start = pmap_start + p->offset - offset;
1442 pmap_protect(pmap, start, start + PAGE_SIZE_64, prot);
1443 }
1444 }
1445 } else {
1446 queue_iterate(&object->memq, p, vm_page_t, listq) {
1447 if (!p->fictitious &&
1448 (offset <= p->offset) && (p->offset < end)) {
1449
1450 pmap_page_protect(p->phys_page,
1451 prot & ~p->page_lock);
1452 }
1453 }
1454 }
1455 } else {
1456 vm_page_t p;
1457 vm_object_offset_t end;
1458 vm_object_offset_t target_off;
1459
1460 end = offset + size;
1461
1462 if (pmap != PMAP_NULL) {
1463 for(target_off = offset;
1464 target_off < end;
1465 target_off += PAGE_SIZE) {
1466 p = vm_page_lookup(object, target_off);
1467 if (p != VM_PAGE_NULL) {
1468 vm_offset_t start;
1469 start = pmap_start +
1470 (vm_offset_t)(p->offset - offset);
1471 pmap_protect(pmap, start,
1472 start + PAGE_SIZE, prot);
1473 }
1474 }
1475 } else {
1476 for(target_off = offset;
1477 target_off < end; target_off += PAGE_SIZE) {
1478 p = vm_page_lookup(object, target_off);
1479 if (p != VM_PAGE_NULL) {
1480 pmap_page_protect(p->phys_page,
1481 prot & ~p->page_lock);
1482 }
1483 }
1484 }
1485 }
1486
1487 if (prot == VM_PROT_NONE) {
1488 /*
1489 * Must follow shadow chain to remove access
1490 * to pages in shadowed objects.
1491 */
1492 register vm_object_t next_object;
1493
1494 next_object = object->shadow;
1495 if (next_object != VM_OBJECT_NULL) {
1496 offset += object->shadow_offset;
1497 vm_object_lock(next_object);
1498 vm_object_unlock(object);
1499 object = next_object;
1500 }
1501 else {
1502 /*
1503 * End of chain - we are done.
1504 */
1505 break;
1506 }
1507 }
1508 else {
1509 /*
1510 * Pages in shadowed objects may never have
1511 * write permission - we may stop here.
1512 */
1513 break;
1514 }
1515 }
1516
1517 vm_object_unlock(object);
1518 }
1519
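/*
 * A sketch of a typical call as described above (hypothetical wrapper):
 * downgrading to read-only only needs the top-level object, whereas
 * passing VM_PROT_NONE would also walk the shadow chain.
 */
#if 0	/* illustrative sketch only */
static void
vm_object_write_protect_example(
	vm_object_t		object,		/* temporary/internal object */
	vm_object_offset_t	offset,		/* start within the object */
	vm_object_size_t	size,		/* length of the range */
	pmap_t			pmap,		/* the only pmap mapping it */
	vm_map_offset_t		va)		/* where it is mapped there */
{
	/* Object must be unlocked; write permission is removed. */
	vm_object_pmap_protect(object, offset, size, pmap, va,
			       VM_PROT_READ);
}
#endif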
1520 /*
1521 * Routine: vm_object_copy_slowly
1522 *
1523 * Description:
1524 * Copy the specified range of the source
1525 * virtual memory object without using
1526 * protection-based optimizations (such
1527 * as copy-on-write). The pages in the
1528 * region are actually copied.
1529 *
1530 * In/out conditions:
1531 * The caller must hold a reference and a lock
1532 * for the source virtual memory object. The source
1533 * object will be returned *unlocked*.
1534 *
1535 * Results:
1536 * If the copy is completed successfully, KERN_SUCCESS is
1537 * returned. If the caller asserted the interruptible
1538 * argument, and an interruption occurred while waiting
1539 * for a user-generated event, MACH_SEND_INTERRUPTED is
1540 * returned. Other values may be returned to indicate
1541 * hard errors during the copy operation.
1542 *
1543 * A new virtual memory object is returned in a
1544 * parameter (_result_object). The contents of this
1545 * new object, starting at a zero offset, are a copy
1546 * of the source memory region. In the event of
1547 * an error, this parameter will contain the value
1548 * VM_OBJECT_NULL.
1549 */
1550 __private_extern__ kern_return_t
1551 vm_object_copy_slowly(
1552 register vm_object_t src_object,
1553 vm_object_offset_t src_offset,
1554 vm_object_size_t size,
1555 boolean_t interruptible,
1556 vm_object_t *_result_object) /* OUT */
1557 {
1558 vm_object_t new_object;
1559 vm_object_offset_t new_offset;
1560
1561 vm_object_offset_t src_lo_offset = src_offset;
1562 vm_object_offset_t src_hi_offset = src_offset + size;
1563
1564 XPR(XPR_VM_OBJECT, "v_o_c_slowly obj 0x%x off 0x%x size 0x%x\n",
1565 src_object, src_offset, size, 0, 0);
1566
1567 if (size == 0) {
1568 vm_object_unlock(src_object);
1569 *_result_object = VM_OBJECT_NULL;
1570 return(KERN_INVALID_ARGUMENT);
1571 }
1572
1573 /*
1574 * Prevent destruction of the source object while we copy.
1575 */
1576
1577 assert(src_object->ref_count > 0);
1578 src_object->ref_count++;
1579 VM_OBJ_RES_INCR(src_object);
1580 vm_object_unlock(src_object);
1581
1582 /*
1583 * Create a new object to hold the copied pages.
1584 * A few notes:
1585 * We fill the new object starting at offset 0,
1586 * regardless of the input offset.
1587 * We don't bother to lock the new object within
1588 * this routine, since we have the only reference.
1589 */
1590
1591 new_object = vm_object_allocate(size);
1592 new_offset = 0;
1593 vm_object_lock(new_object);
1594
1595 assert(size == trunc_page_64(size)); /* Will the loop terminate? */
1596
1597 for ( ;
1598 size != 0 ;
1599 src_offset += PAGE_SIZE_64,
1600 new_offset += PAGE_SIZE_64, size -= PAGE_SIZE_64
1601 ) {
1602 vm_page_t new_page;
1603 vm_fault_return_t result;
1604
1605 while ((new_page = vm_page_alloc(new_object, new_offset))
1606 == VM_PAGE_NULL) {
1607 if (!vm_page_wait(interruptible)) {
1608 vm_object_unlock(new_object);
1609 vm_object_deallocate(new_object);
1610 vm_object_deallocate(src_object);
1611 *_result_object = VM_OBJECT_NULL;
1612 return(MACH_SEND_INTERRUPTED);
1613 }
1614 }
1615
1616 do {
1617 vm_prot_t prot = VM_PROT_READ;
1618 vm_page_t _result_page;
1619 vm_page_t top_page;
1620 register
1621 vm_page_t result_page;
1622 kern_return_t error_code;
1623
1624 vm_object_lock(src_object);
1625 vm_object_paging_begin(src_object);
1626
1627 XPR(XPR_VM_FAULT,"vm_object_copy_slowly -> vm_fault_page",0,0,0,0,0);
1628 result = vm_fault_page(src_object, src_offset,
1629 VM_PROT_READ, FALSE, interruptible,
1630 src_lo_offset, src_hi_offset,
1631 VM_BEHAVIOR_SEQUENTIAL,
1632 &prot, &_result_page, &top_page,
1633 (int *)0,
1634 &error_code, FALSE, FALSE, NULL, 0);
1635
1636 switch(result) {
1637 case VM_FAULT_SUCCESS:
1638 result_page = _result_page;
1639
1640 /*
1641 * We don't need to hold the object
1642 * lock -- the busy page will be enough.
1643 * [We don't care about picking up any
1644 * new modifications.]
1645 *
1646 * Copy the page to the new object.
1647 *
1648 * POLICY DECISION:
1649 * If result_page is clean,
1650 * we could steal it instead
1651 * of copying.
1652 */
1653
1654 vm_object_unlock(result_page->object);
1655 vm_page_copy(result_page, new_page);
1656
1657 /*
1658 * Let go of both pages (make them
1659 * not busy, perform wakeup, activate).
1660 */
1661
1662 new_page->busy = FALSE;
1663 new_page->dirty = TRUE;
1664 vm_object_lock(result_page->object);
1665 PAGE_WAKEUP_DONE(result_page);
1666
1667 vm_page_lock_queues();
1668 if (!result_page->active &&
1669 !result_page->inactive)
1670 vm_page_activate(result_page);
1671 vm_page_activate(new_page);
1672 vm_page_unlock_queues();
1673
1674 /*
1675 * Release paging references and
1676 * top-level placeholder page, if any.
1677 */
1678
1679 vm_fault_cleanup(result_page->object,
1680 top_page);
1681
1682 break;
1683
1684 case VM_FAULT_RETRY:
1685 break;
1686
1687 case VM_FAULT_FICTITIOUS_SHORTAGE:
1688 vm_page_more_fictitious();
1689 break;
1690
1691 case VM_FAULT_MEMORY_SHORTAGE:
1692 if (vm_page_wait(interruptible))
1693 break;
1694 /* fall thru */
1695
1696 case VM_FAULT_INTERRUPTED:
1697 vm_page_free(new_page);
1698 vm_object_unlock(new_object);
1699 vm_object_deallocate(new_object);
1700 vm_object_deallocate(src_object);
1701 *_result_object = VM_OBJECT_NULL;
1702 return(MACH_SEND_INTERRUPTED);
1703
1704 case VM_FAULT_MEMORY_ERROR:
1705 /*
1706 * A policy choice:
1707 * (a) ignore pages that we can't
1708 * copy
1709 * (b) return the null object if
1710 * any page fails [chosen]
1711 */
1712
1713 vm_page_lock_queues();
1714 vm_page_free(new_page);
1715 vm_page_unlock_queues();
1716 vm_object_unlock(new_object);
1717 vm_object_deallocate(new_object);
1718 vm_object_deallocate(src_object);
1719 *_result_object = VM_OBJECT_NULL;
1720 return(error_code ? error_code:
1721 KERN_MEMORY_ERROR);
1722 }
1723 } while (result != VM_FAULT_SUCCESS);
1724 }
1725
1726 /*
1727 * Lose the extra reference, and return our object.
1728 */
1729
1730 vm_object_unlock(new_object);
1731 vm_object_deallocate(src_object);
1732 *_result_object = new_object;
1733 return(KERN_SUCCESS);
1734 }
1735
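/*
 * A sketch of the calling convention spelled out above (hypothetical
 * wrapper): the source object is passed in locked with a reference held,
 * and vm_object_copy_slowly() returns it unlocked; the copy, if any,
 * starts at offset 0.
 */
#if 0	/* illustrative sketch only */
static kern_return_t
vm_object_copy_slowly_example(
	vm_object_t		src_object,
	vm_object_offset_t	src_offset,
	vm_object_size_t	size,
	vm_object_t		*copy_p)	/* OUT */
{
	kern_return_t	kr;

	/* Caller already holds a reference on src_object. */
	vm_object_lock(src_object);

	kr = vm_object_copy_slowly(src_object, src_offset, size,
				   FALSE,	/* not interruptible */
				   copy_p);
	if (kr != KERN_SUCCESS)
		assert(*copy_p == VM_OBJECT_NULL);
	return kr;
}
#endif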
1736 /*
1737 * Routine: vm_object_copy_quickly
1738 *
1739 * Purpose:
1740 * Copy the specified range of the source virtual
1741 * memory object, if it can be done without waiting
1742 * for user-generated events.
1743 *
1744 * Results:
1745 * If the copy is successful, the copy is returned in
1746 * the arguments; otherwise, the arguments are not
1747 * affected.
1748 *
1749 * In/out conditions:
1750 * The object should be unlocked on entry and exit.
1751 */
1752
1753 /*ARGSUSED*/
1754 __private_extern__ boolean_t
1755 vm_object_copy_quickly(
1756 vm_object_t *_object, /* INOUT */
1757 __unused vm_object_offset_t offset, /* IN */
1758 __unused vm_object_size_t size, /* IN */
1759 boolean_t *_src_needs_copy, /* OUT */
1760 boolean_t *_dst_needs_copy) /* OUT */
1761 {
1762 vm_object_t object = *_object;
1763 memory_object_copy_strategy_t copy_strategy;
1764
1765 XPR(XPR_VM_OBJECT, "v_o_c_quickly obj 0x%x off 0x%x size 0x%x\n",
1766 *_object, offset, size, 0, 0);
1767 if (object == VM_OBJECT_NULL) {
1768 *_src_needs_copy = FALSE;
1769 *_dst_needs_copy = FALSE;
1770 return(TRUE);
1771 }
1772
1773 vm_object_lock(object);
1774
1775 copy_strategy = object->copy_strategy;
1776
1777 switch (copy_strategy) {
1778 case MEMORY_OBJECT_COPY_SYMMETRIC:
1779
1780 /*
1781 * Symmetric copy strategy.
1782 * Make another reference to the object.
1783 * Leave object/offset unchanged.
1784 */
1785
1786 assert(object->ref_count > 0);
1787 object->ref_count++;
1788 vm_object_res_reference(object);
1789 object->shadowed = TRUE;
1790 vm_object_unlock(object);
1791
1792 /*
1793 * Both source and destination must make
1794 * shadows, and the source must be made
1795 * read-only if not already.
1796 */
1797
1798 *_src_needs_copy = TRUE;
1799 *_dst_needs_copy = TRUE;
1800
1801 break;
1802
1803 case MEMORY_OBJECT_COPY_DELAY:
1804 vm_object_unlock(object);
1805 return(FALSE);
1806
1807 default:
1808 vm_object_unlock(object);
1809 return(FALSE);
1810 }
1811 return(TRUE);
1812 }
1813
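/*
 * A sketch (hypothetical wrapper) of how a caller reacts to the two
 * "needs copy" results above; the follow-up bookkeeping mentioned in the
 * comment lives in the vm_map layer, not in this file.
 */
#if 0	/* illustrative sketch only */
static boolean_t
vm_object_copy_quickly_example(
	vm_object_t		*object_p,	/* INOUT */
	vm_object_offset_t	offset,
	vm_object_size_t	size)
{
	boolean_t	src_needs_copy;
	boolean_t	dst_needs_copy;

	/* Objects are unlocked on entry and exit. */
	if (!vm_object_copy_quickly(object_p, offset, size,
				    &src_needs_copy, &dst_needs_copy))
		return FALSE;		/* fall back to another strategy */

	/*
	 * With the symmetric strategy both flags come back TRUE: the
	 * caller would mark the source entry needs_copy/read-only and
	 * shadow the destination entry.
	 */
	(void) src_needs_copy;
	(void) dst_needs_copy;
	return TRUE;
}
#endif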
1814 static int copy_call_count = 0;
1815 static int copy_call_sleep_count = 0;
1816 static int copy_call_restart_count = 0;
1817
1818 /*
1819 * Routine: vm_object_copy_call [internal]
1820 *
1821 * Description:
1822 * Copy the source object (src_object), using the
1823 * user-managed copy algorithm.
1824 *
1825 * In/out conditions:
1826 * The source object must be locked on entry. It
1827 * will be *unlocked* on exit.
1828 *
1829 * Results:
1830 * If the copy is successful, KERN_SUCCESS is returned.
1831 * A new object that represents the copied virtual
1832 * memory is returned in a parameter (*_result_object).
1833 * If the return value indicates an error, this parameter
1834 * is not valid.
1835 */
1836 static kern_return_t
1837 vm_object_copy_call(
1838 vm_object_t src_object,
1839 vm_object_offset_t src_offset,
1840 vm_object_size_t size,
1841 vm_object_t *_result_object) /* OUT */
1842 {
1843 kern_return_t kr;
1844 vm_object_t copy;
1845 boolean_t check_ready = FALSE;
1846
1847 /*
1848 * If a copy is already in progress, wait and retry.
1849 *
1850 * XXX
1851  * Consider making this call interruptible, as Mike
1852 * intended it to be.
1853 *
1854 * XXXO
1855 * Need a counter or version or something to allow
1856 * us to use the copy that the currently requesting
1857 * thread is obtaining -- is it worth adding to the
1858  * vm object structure? Depends on how common this case is.
1859 */
1860 copy_call_count++;
1861 while (vm_object_wanted(src_object, VM_OBJECT_EVENT_COPY_CALL)) {
1862 vm_object_sleep(src_object, VM_OBJECT_EVENT_COPY_CALL,
1863 THREAD_UNINT);
1864 copy_call_restart_count++;
1865 }
1866
1867 /*
1868 * Indicate (for the benefit of memory_object_create_copy)
1869 * that we want a copy for src_object. (Note that we cannot
1870 * do a real assert_wait before calling memory_object_copy,
1871 * so we simply set the flag.)
1872 */
1873
1874 vm_object_set_wanted(src_object, VM_OBJECT_EVENT_COPY_CALL);
1875 vm_object_unlock(src_object);
1876
1877 /*
1878 * Ask the memory manager to give us a memory object
1879 * which represents a copy of the src object.
1880 * The memory manager may give us a memory object
1881 * which we already have, or it may give us a
1882 * new memory object. This memory object will arrive
1883 * via memory_object_create_copy.
1884 */
1885
1886 kr = KERN_FAILURE; /* XXX need to change memory_object.defs */
1887 if (kr != KERN_SUCCESS) {
1888 return kr;
1889 }
1890
1891 /*
1892 * Wait for the copy to arrive.
1893 */
1894 vm_object_lock(src_object);
1895 while (vm_object_wanted(src_object, VM_OBJECT_EVENT_COPY_CALL)) {
1896 vm_object_sleep(src_object, VM_OBJECT_EVENT_COPY_CALL,
1897 THREAD_UNINT);
1898 copy_call_sleep_count++;
1899 }
1900 Retry:
1901 assert(src_object->copy != VM_OBJECT_NULL);
1902 copy = src_object->copy;
1903 if (!vm_object_lock_try(copy)) {
1904 vm_object_unlock(src_object);
1905 mutex_pause(); /* wait a bit */
1906 vm_object_lock(src_object);
1907 goto Retry;
1908 }
1909 if (copy->size < src_offset+size)
1910 copy->size = src_offset+size;
1911
1912 if (!copy->pager_ready)
1913 check_ready = TRUE;
1914
1915 /*
1916 * Return the copy.
1917 */
1918 *_result_object = copy;
1919 vm_object_unlock(copy);
1920 vm_object_unlock(src_object);
1921
1922 /* Wait for the copy to be ready. */
1923 if (check_ready == TRUE) {
1924 vm_object_lock(copy);
1925 while (!copy->pager_ready) {
1926 vm_object_sleep(copy, VM_OBJECT_EVENT_PAGER_READY, THREAD_UNINT);
1927 }
1928 vm_object_unlock(copy);
1929 }
1930
1931 return KERN_SUCCESS;
1932 }
1933
1934 static int copy_delayed_lock_collisions = 0;
1935 static int copy_delayed_max_collisions = 0;
1936 static int copy_delayed_lock_contention = 0;
1937 static int copy_delayed_protect_iterate = 0;
1938
1939 /*
1940 * Routine: vm_object_copy_delayed [internal]
1941 *
1942 * Description:
1943 * Copy the specified virtual memory object, using
1944 * the asymmetric copy-on-write algorithm.
1945 *
1946 * In/out conditions:
1947 * The src_object must be locked on entry. It will be unlocked
1948 * on exit - so the caller must also hold a reference to it.
1949 *
1950 * This routine will not block waiting for user-generated
1951 * events. It is not interruptible.
1952 */
1953 __private_extern__ vm_object_t
1954 vm_object_copy_delayed(
1955 vm_object_t src_object,
1956 vm_object_offset_t src_offset,
1957 vm_object_size_t size)
1958 {
1959 vm_object_t new_copy = VM_OBJECT_NULL;
1960 vm_object_t old_copy;
1961 vm_page_t p;
1962 vm_object_size_t copy_size = src_offset + size;
1963
1964 int collisions = 0;
1965 /*
1966 * The user-level memory manager wants to see all of the changes
1967 * to this object, but it has promised not to make any changes on
1968 * its own.
1969 *
1970 * Perform an asymmetric copy-on-write, as follows:
1971 * Create a new object, called a "copy object" to hold
1972 * pages modified by the new mapping (i.e., the copy,
1973 * not the original mapping).
1974 * Record the original object as the backing object for
1975 * the copy object. If the original mapping does not
1976 * change a page, it may be used read-only by the copy.
1977 * Record the copy object in the original object.
1978 * When the original mapping causes a page to be modified,
1979 * it must be copied to a new page that is "pushed" to
1980 * the copy object.
1981 * Mark the new mapping (the copy object) copy-on-write.
1982 * This makes the copy object itself read-only, allowing
1983 * it to be reused if the original mapping makes no
1984 * changes, and simplifying the synchronization required
1985 * in the "push" operation described above.
1986 *
1987 * The copy-on-write is said to be asymmetric because the original
1988 * object is *not* marked copy-on-write. A copied page is pushed
1989 * to the copy object, regardless of which party attempted to modify
1990 * the page.
1991 *
1992 * Repeated asymmetric copy operations may be done. If the
1993 * original object has not been changed since the last copy, its
1994 * copy object can be reused. Otherwise, a new copy object can be
1995 * inserted between the original object and its previous copy
1996 * object. Since any copy object is read-only, this cannot affect
1997 * affect the contents of the previous copy object.
1998 *
1999 * Note that a copy object is higher in the object tree than the
2000 * original object; therefore, use of the copy object recorded in
2001 * the original object must be done carefully, to avoid deadlock.
2002 */
2003
2004 Retry:
2005
2006 /*
2007 * Wait for paging in progress.
2008 */
2009 if (!src_object->true_share)
2010 vm_object_paging_wait(src_object, THREAD_UNINT);
2011
2012 /*
2013 * See whether we can reuse the result of a previous
2014 * copy operation.
2015 */
2016
2017 old_copy = src_object->copy;
2018 if (old_copy != VM_OBJECT_NULL) {
2019 /*
2020 * Try to get the locks (out of order)
2021 */
2022 if (!vm_object_lock_try(old_copy)) {
2023 vm_object_unlock(src_object);
2024 mutex_pause();
2025
2026 /* Heisenberg Rules */
2027 copy_delayed_lock_collisions++;
2028 if (collisions++ == 0)
2029 copy_delayed_lock_contention++;
2030
2031 if (collisions > copy_delayed_max_collisions)
2032 copy_delayed_max_collisions = collisions;
2033
2034 vm_object_lock(src_object);
2035 goto Retry;
2036 }
2037
2038 /*
2039 * Determine whether the old copy object has
2040 * been modified.
2041 */
2042
2043 if (old_copy->resident_page_count == 0 &&
2044 !old_copy->pager_created) {
2045 /*
2046 * It has not been modified.
2047 *
2048 * Return another reference to
2049 * the existing copy-object if
2050 * we can safely grow it (if
2051 * needed).
2052 */
2053
2054 if (old_copy->size < copy_size) {
2055 /*
2056 * We can't perform a delayed copy if any of the
2057 * pages in the extended range are wired (because
2058 * we can't safely take write permission away from
2059 * wired pages). If the pages aren't wired, then
2060 * go ahead and protect them.
2061 */
2062 copy_delayed_protect_iterate++;
2063 queue_iterate(&src_object->memq, p, vm_page_t, listq) {
2064 if (!p->fictitious &&
2065 p->offset >= old_copy->size &&
2066 p->offset < copy_size) {
2067 if (p->wire_count > 0) {
2068 vm_object_unlock(old_copy);
2069 vm_object_unlock(src_object);
2070
2071 if (new_copy != VM_OBJECT_NULL) {
2072 vm_object_unlock(new_copy);
2073 vm_object_deallocate(new_copy);
2074 }
2075
2076 return VM_OBJECT_NULL;
2077 } else {
2078 pmap_page_protect(p->phys_page,
2079 (VM_PROT_ALL & ~VM_PROT_WRITE &
2080 ~p->page_lock));
2081 }
2082 }
2083 }
2084 old_copy->size = copy_size;
2085 }
2086
2087 vm_object_reference_locked(old_copy);
2088 vm_object_unlock(old_copy);
2089 vm_object_unlock(src_object);
2090
2091 if (new_copy != VM_OBJECT_NULL) {
2092 vm_object_unlock(new_copy);
2093 vm_object_deallocate(new_copy);
2094 }
2095
2096 return(old_copy);
2097 }
2098
2099 /*
2100 * Adjust the size argument so that the newly-created
2101 * copy object will be large enough to back either the
2102 * old copy object or the new mapping.
2103 */
2104 if (old_copy->size > copy_size)
2105 copy_size = old_copy->size;
2106
2107 if (new_copy == VM_OBJECT_NULL) {
2108 vm_object_unlock(old_copy);
2109 vm_object_unlock(src_object);
2110 new_copy = vm_object_allocate(copy_size);
2111 vm_object_lock(src_object);
2112 vm_object_lock(new_copy);
2113 goto Retry;
2114 }
2115 new_copy->size = copy_size;
2116
2117 /*
2118 * The copy-object is always made large enough to
2119 * completely shadow the original object, since
2120 * it may have several users who want to shadow
2121 * the original object at different points.
2122 */
2123
2124 assert((old_copy->shadow == src_object) &&
2125 (old_copy->shadow_offset == (vm_object_offset_t) 0));
2126
2127 } else if (new_copy == VM_OBJECT_NULL) {
2128 vm_object_unlock(src_object);
2129 new_copy = vm_object_allocate(copy_size);
2130 vm_object_lock(src_object);
2131 vm_object_lock(new_copy);
2132 goto Retry;
2133 }
2134
2135 /*
2136 * We now have the src object locked, and the new copy object
2137 * allocated and locked (and potentially the old copy locked).
2138 * Before we go any further, make sure we can still perform
2139 * a delayed copy, as the situation may have changed.
2140 *
2141 * Specifically, we can't perform a delayed copy if any of the
2142 * pages in the range are wired (because we can't safely take
2143 * write permission away from wired pages). If the pages aren't
2144 * wired, then go ahead and protect them.
2145 */
2146 copy_delayed_protect_iterate++;
2147 queue_iterate(&src_object->memq, p, vm_page_t, listq) {
2148 if (!p->fictitious && p->offset < copy_size) {
2149 if (p->wire_count > 0) {
2150 if (old_copy)
2151 vm_object_unlock(old_copy);
2152 vm_object_unlock(src_object);
2153 vm_object_unlock(new_copy);
2154 vm_object_deallocate(new_copy);
2155 return VM_OBJECT_NULL;
2156 } else {
2157 pmap_page_protect(p->phys_page,
2158 (VM_PROT_ALL & ~VM_PROT_WRITE &
2159 ~p->page_lock));
2160 }
2161 }
2162 }
2163
2164 if (old_copy != VM_OBJECT_NULL) {
2165 /*
2166 * Make the old copy-object shadow the new one.
2167 * It will receive no more pages from the original
2168 * object.
2169 */
2170
2171 src_object->ref_count--; /* remove ref. from old_copy */
2172 assert(src_object->ref_count > 0);
2173 old_copy->shadow = new_copy;
2174 assert(new_copy->ref_count > 0);
2175 new_copy->ref_count++; /* for old_copy->shadow ref. */
2176
2177 #if TASK_SWAPPER
2178 if (old_copy->res_count) {
2179 VM_OBJ_RES_INCR(new_copy);
2180 VM_OBJ_RES_DECR(src_object);
2181 }
2182 #endif
2183
2184 vm_object_unlock(old_copy); /* done with old_copy */
2185 }
2186
2187 /*
2188 * Point the new copy at the existing object.
2189 */
2190 new_copy->shadow = src_object;
2191 new_copy->shadow_offset = 0;
2192 new_copy->shadowed = TRUE; /* caller must set needs_copy */
2193 assert(src_object->ref_count > 0);
2194 src_object->ref_count++;
2195 VM_OBJ_RES_INCR(src_object);
2196 src_object->copy = new_copy;
2197 vm_object_unlock(src_object);
2198 vm_object_unlock(new_copy);
2199
2200 XPR(XPR_VM_OBJECT,
2201 "vm_object_copy_delayed: used copy object %X for source %X\n",
2202 (integer_t)new_copy, (integer_t)src_object, 0, 0, 0);
2203
2204 return(new_copy);
2205 }
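
/*
 * Illustrative sketch, not part of the original file: one way a
 * hypothetical caller might drive vm_object_copy_delayed() above.
 * The caller holds a reference on src_object and locks it before the
 * call; the routine unlocks it.  A VM_OBJECT_NULL result means wired
 * pages prevented the delayed copy, and the caller must fall back to
 * another strategy (compare the MEMORY_OBJECT_COPY_DELAY case in
 * vm_object_copy_strategically() below).
 */
#if 0
static kern_return_t
vm_object_copy_delayed_example(
	vm_object_t		src_object,	/* referenced by caller */
	vm_object_offset_t	src_offset,
	vm_object_size_t	size,
	vm_object_t		*copy_object)	/* OUT */
{
	vm_object_lock(src_object);
	*copy_object = vm_object_copy_delayed(src_object, src_offset, size);
	/* src_object has been unlocked by vm_object_copy_delayed() */

	if (*copy_object == VM_OBJECT_NULL)
		return KERN_FAILURE;	/* wired pages: delayed copy refused */

	/*
	 * The mapping that uses *copy_object must be marked needs_copy
	 * by the caller (see "caller must set needs_copy" above).
	 */
	return KERN_SUCCESS;
}
#endif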
2206
2207 /*
2208 * Routine: vm_object_copy_strategically
2209 *
2210 * Purpose:
2211 * Perform a copy according to the source object's
2212 * declared strategy. This operation may block,
2213 * and may be interrupted.
2214 */
2215 __private_extern__ kern_return_t
2216 vm_object_copy_strategically(
2217 register vm_object_t src_object,
2218 vm_object_offset_t src_offset,
2219 vm_object_size_t size,
2220 vm_object_t *dst_object, /* OUT */
2221 vm_object_offset_t *dst_offset, /* OUT */
2222 boolean_t *dst_needs_copy) /* OUT */
2223 {
2224 kern_return_t result;
2225 boolean_t interruptible = THREAD_ABORTSAFE; /* XXX */
2226 memory_object_copy_strategy_t copy_strategy;
2227
2228 assert(src_object != VM_OBJECT_NULL);
2229
2230 vm_object_lock(src_object);
2231
2232 /*
2233 * The copy strategy is only valid if the memory manager
2234 * is "ready". Internal objects are always ready.
2235 */
2236
2237 while (!src_object->internal && !src_object->pager_ready) {
2238 wait_result_t wait_result;
2239
2240 wait_result = vm_object_sleep( src_object,
2241 VM_OBJECT_EVENT_PAGER_READY,
2242 interruptible);
2243 if (wait_result != THREAD_AWAKENED) {
2244 vm_object_unlock(src_object);
2245 *dst_object = VM_OBJECT_NULL;
2246 *dst_offset = 0;
2247 *dst_needs_copy = FALSE;
2248 return(MACH_SEND_INTERRUPTED);
2249 }
2250 }
2251
2252 copy_strategy = src_object->copy_strategy;
2253
2254 /*
2255 * Use the appropriate copy strategy.
2256 */
2257
2258 switch (copy_strategy) {
2259 case MEMORY_OBJECT_COPY_DELAY:
2260 *dst_object = vm_object_copy_delayed(src_object,
2261 src_offset, size);
2262 if (*dst_object != VM_OBJECT_NULL) {
2263 *dst_offset = src_offset;
2264 *dst_needs_copy = TRUE;
2265 result = KERN_SUCCESS;
2266 break;
2267 }
2268 vm_object_lock(src_object);
2269 /* fall thru when delayed copy not allowed */
2270
2271 case MEMORY_OBJECT_COPY_NONE:
2272 result = vm_object_copy_slowly(src_object, src_offset, size,
2273 interruptible, dst_object);
2274 if (result == KERN_SUCCESS) {
2275 *dst_offset = 0;
2276 *dst_needs_copy = FALSE;
2277 }
2278 break;
2279
2280 case MEMORY_OBJECT_COPY_CALL:
2281 result = vm_object_copy_call(src_object, src_offset, size,
2282 dst_object);
2283 if (result == KERN_SUCCESS) {
2284 *dst_offset = src_offset;
2285 *dst_needs_copy = TRUE;
2286 }
2287 break;
2288
2289 case MEMORY_OBJECT_COPY_SYMMETRIC:
2290 XPR(XPR_VM_OBJECT, "v_o_c_strategically obj 0x%x off 0x%x size 0x%x\n",(natural_t)src_object, src_offset, size, 0, 0);
2291 vm_object_unlock(src_object);
2292 result = KERN_MEMORY_RESTART_COPY;
2293 break;
2294
2295 default:
2296 panic("copy_strategically: bad strategy");
2297 result = KERN_INVALID_ARGUMENT;
2298 }
2299 return(result);
2300 }
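
/*
 * Illustrative sketch, not part of the original file: how a hypothetical
 * caller might consume the OUT parameters of vm_object_copy_strategically()
 * above.  KERN_MEMORY_RESTART_COPY is returned for the symmetric strategy,
 * in which case the caller must perform the copy by some other means.
 */
#if 0
static kern_return_t
copy_strategically_example(
	vm_object_t		src_object,
	vm_object_offset_t	src_offset,
	vm_object_size_t	size)
{
	vm_object_t		dst_object;
	vm_object_offset_t	dst_offset;
	boolean_t		dst_needs_copy;
	kern_return_t		kr;

	kr = vm_object_copy_strategically(src_object, src_offset, size,
					  &dst_object, &dst_offset,
					  &dst_needs_copy);
	if (kr != KERN_SUCCESS)
		return kr;	/* includes KERN_MEMORY_RESTART_COPY */

	/*
	 * Map (dst_object, dst_offset); if dst_needs_copy is TRUE the
	 * new mapping must be entered copy-on-write.
	 */
	(void) dst_needs_copy;
	return KERN_SUCCESS;
}
#endif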
2301
2302 /*
2303 * vm_object_shadow:
2304 *
2305 * Create a new object which is backed by the
2306 * specified existing object range. The source
2307 * object reference is deallocated.
2308 *
2309 * The new object and offset into that object
2310 * are returned in the source parameters.
2311 */
2312 boolean_t vm_object_shadow_check = FALSE;
2313
2314 __private_extern__ boolean_t
2315 vm_object_shadow(
2316 vm_object_t *object, /* IN/OUT */
2317 vm_object_offset_t *offset, /* IN/OUT */
2318 vm_object_size_t length)
2319 {
2320 register vm_object_t source;
2321 register vm_object_t result;
2322
2323 source = *object;
2324 assert(source->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
2325
2326 /*
2327 * Determine if we really need a shadow.
2328 */
2329
2330 if (vm_object_shadow_check && source->ref_count == 1 &&
2331 (source->shadow == VM_OBJECT_NULL ||
2332 source->shadow->copy == VM_OBJECT_NULL))
2333 {
2334 source->shadowed = FALSE;
2335 return FALSE;
2336 }
2337
2338 /*
2339 * Allocate a new object with the given length
2340 */
2341
2342 if ((result = vm_object_allocate(length)) == VM_OBJECT_NULL)
2343 panic("vm_object_shadow: no object for shadowing");
2344
2345 /*
2346 * The new object shadows the source object, adding
2347 * a reference to it. Our caller changes his reference
2348 * to point to the new object, removing a reference to
2349 * the source object. Net result: no change of reference
2350 * count.
2351 */
2352 result->shadow = source;
2353
2354 /*
2355 * Store the offset into the source object,
2356 * and fix up the offset into the new object.
2357 */
2358
2359 result->shadow_offset = *offset;
2360
2361 /*
2362 * Return the new object and offset.
2363 */
2364
2365 *offset = 0;
2366 *object = result;
2367 return TRUE;
2368 }
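
/*
 * Illustrative sketch, not part of the original file: the IN/OUT
 * convention of vm_object_shadow() above.  The caller's reference to the
 * source object is, in effect, donated to the new object's shadow
 * pointer, so no additional reference counting is needed here.
 */
#if 0
static void
vm_object_shadow_example(
	vm_object_t		*objectp,	/* IN/OUT: caller's object */
	vm_object_offset_t	*offsetp,	/* IN/OUT */
	vm_object_size_t	length)
{
	if (vm_object_shadow(objectp, offsetp, length)) {
		/*
		 * *objectp is now a new object shadowing the old one at
		 * the old *offsetp, and *offsetp has been reset to 0.
		 */
	} else {
		/*
		 * No shadow was needed (vm_object_shadow_check path);
		 * *objectp and *offsetp are unchanged.
		 */
	}
}
#endif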
2369
2370 /*
2371 * The relationship between vm_object structures and
2372 * the memory_object requires careful synchronization.
2373 *
2374 * All associations are created by memory_object_create_named
2375 * for external pagers and vm_object_pager_create for internal
2376 * objects as follows:
2377 *
2378 * pager: the memory_object itself, supplied by
2379 * the user requesting a mapping (or the kernel,
2380 * when initializing internal objects); the
2381 * kernel simulates holding send rights by keeping
2382 * a port reference;
2383 *
2384 * pager_request:
2385 * the memory object control port,
2386 * created by the kernel; the kernel holds
2387 * receive (and ownership) rights to this
2388 * port, but no other references.
2389 *
2390 * When initialization is complete, the "initialized" field
2391 * is asserted. Other mappings using a particular memory object,
2392 * and any references to the vm_object gained through the
2393 * port association must wait for this initialization to occur.
2394 *
2395 * In order to allow the memory manager to set attributes before
2396 * requests (notably virtual copy operations, but also data or
2397 * unlock requests) are made, a "ready" attribute is made available.
2398 * Only the memory manager may affect the value of this attribute.
2399 * Its value does not affect critical kernel functions, such as
2400 * internal object initialization or destruction. [Furthermore,
2401 * memory objects created by the kernel are assumed to be ready
2402 * immediately; the default memory manager need not explicitly
2403 * set the "ready" attribute.]
2404 *
2405 * [Both the "initialized" and "ready" attribute wait conditions
2406 * use the "pager" field as the wait event.]
2407 *
2408 * The port associations can be broken down by any of the
2409 * following routines:
2410 * vm_object_terminate:
2411 * No references to the vm_object remain, and
2412 * the object cannot (or will not) be cached.
2413 * This is the normal case, and is done even
2414 * though one of the other cases has already been
2415 * done.
2416 * memory_object_destroy:
2417 * The memory manager has requested that the
2418 * kernel relinquish references to the memory
2419 * object. [The memory manager may not want to
2420 * destroy the memory object, but may wish to
2421 * refuse or tear down existing memory mappings.]
2422 *
2423 * Each routine that breaks an association must break all of
2424 * them at once. At some later time, that routine must clear
2425 * the pager field and release the memory object references.
2426 * [Furthermore, each routine must cope with the simultaneous
2427 * or previous operations of the others.]
2428 *
2429 * In addition to the lock on the object, the vm_object_cache_lock
2430 * governs the associations. References gained through the
2431 * association require use of the cache lock.
2432 *
2433 * Because the pager field may be cleared spontaneously, it
2434 * cannot be used to determine whether a memory object has
2435 * ever been associated with a particular vm_object. [This
2436 * knowledge is important to the shadow object mechanism.]
2437 * For this reason, an additional "created" attribute is
2438 * provided.
2439 *
2440 * During various paging operations, the pager reference found in the
2441 * vm_object must be valid. To prevent this reference from being released
2442 * (other than being removed, i.e., made null), routines may use
2443 * the vm_object_paging_begin/end routines [actually, macros].
2444 * The implementation uses the "paging_in_progress" and "wanted" fields.
2445 * [Operations that alter the validity of the pager values include the
2446 * termination routines and vm_object_collapse.]
2447 */
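
/*
 * Illustrative sketch, not part of the original file: the wait
 * conventions described above, as they appear throughout this module.
 * A thread holding the object lock blocks on the "initialized" and
 * "ready" attributes with vm_object_sleep(); both waits use the pager
 * field as the underlying wait event.
 */
#if 0
static void
pager_attribute_wait_example(
	vm_object_t	object)
{
	vm_object_lock(object);

	/* wait for vm_object_enter()/vm_object_pager_create() to finish */
	while (!object->pager_initialized) {
		vm_object_sleep(object,
				VM_OBJECT_EVENT_INITIALIZED,
				THREAD_UNINT);
	}

	/* wait for the memory manager to assert the "ready" attribute */
	while (!object->pager_ready) {
		vm_object_sleep(object,
				VM_OBJECT_EVENT_PAGER_READY,
				THREAD_UNINT);
	}

	vm_object_unlock(object);
}
#endif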
2448
2449 #if 0
2450 static void vm_object_abort_activity(
2451 vm_object_t object);
2452
2453 /*
2454 * Routine: vm_object_abort_activity [internal use only]
2455 * Purpose:
2456 * Abort paging requests pending on this object.
2457 * In/out conditions:
2458 * The object is locked on entry and exit.
2459 */
2460 static void
2461 vm_object_abort_activity(
2462 vm_object_t object)
2463 {
2464 register
2465 vm_page_t p;
2466 vm_page_t next;
2467
2468 XPR(XPR_VM_OBJECT, "vm_object_abort_activity, object 0x%X\n",
2469 (integer_t)object, 0, 0, 0, 0);
2470
2471 /*
2472 * Abort all activity that would be waiting
2473 * for a result on this memory object.
2474 *
2475 * We could also choose to destroy all pages
2476 * that we have in memory for this object, but
2477 * we don't.
2478 */
2479
2480 p = (vm_page_t) queue_first(&object->memq);
2481 while (!queue_end(&object->memq, (queue_entry_t) p)) {
2482 next = (vm_page_t) queue_next(&p->listq);
2483
2484 /*
2485 * If it's being paged in, destroy it.
2486 * If an unlock has been requested, start it again.
2487 */
2488
2489 if (p->busy && p->absent) {
2490 VM_PAGE_FREE(p);
2491 }
2492 else {
2493 if (p->unlock_request != VM_PROT_NONE)
2494 p->unlock_request = VM_PROT_NONE;
2495 PAGE_WAKEUP(p);
2496 }
2497
2498 p = next;
2499 }
2500
2501 /*
2502 * Wake up threads waiting for the memory object to
2503 * become ready.
2504 */
2505
2506 object->pager_ready = TRUE;
2507 vm_object_wakeup(object, VM_OBJECT_EVENT_PAGER_READY);
2508 }
2509
2510 /*
2511 * Routine: vm_object_pager_dead
2512 *
2513 * Purpose:
2514 * A port is being destroyed, and the IPC kobject code
2515 * can't tell if it represents a pager port or not.
2516 * So this function is called each time it sees a port
2517 * die.
2518 * THIS IS HORRIBLY INEFFICIENT. We should only call
2519 * this routine if we had requested a notification on
2520 * the port.
2521 */
2522
2523 __private_extern__ void
2524 vm_object_pager_dead(
2525 ipc_port_t pager)
2526 {
2527 vm_object_t object;
2528 vm_object_hash_entry_t entry;
2529
2530 /*
2531 * Perform essentially the same operations as in vm_object_lookup,
2532 * except that this time we look up based on the memory_object
2533 * port, not the control port.
2534 */
2535 vm_object_cache_lock();
2536 entry = vm_object_hash_lookup(pager, FALSE);
2537 if (entry == VM_OBJECT_HASH_ENTRY_NULL ||
2538 entry->object == VM_OBJECT_NULL) {
2539 vm_object_cache_unlock();
2540 return;
2541 }
2542
2543 object = entry->object;
2544 entry->object = VM_OBJECT_NULL;
2545
2546 vm_object_lock(object);
2547 if (object->ref_count == 0) {
2548 XPR(XPR_VM_OBJECT_CACHE,
2549 "vm_object_destroy: removing %x from cache, head (%x, %x)\n",
2550 (integer_t)object,
2551 (integer_t)vm_object_cached_list.next,
2552 (integer_t)vm_object_cached_list.prev, 0,0);
2553
2554 queue_remove(&vm_object_cached_list, object,
2555 vm_object_t, cached_list);
2556 vm_object_cached_count--;
2557 }
2558 object->ref_count++;
2559 vm_object_res_reference(object);
2560
2561 object->can_persist = FALSE;
2562
2563 assert(object->pager == pager);
2564
2565 /*
2566 * Remove the pager association.
2567 *
2568 * Note that the memory_object itself is dead, so
2569 * we don't bother with it.
2570 */
2571
2572 object->pager = MEMORY_OBJECT_NULL;
2573
2574 vm_object_unlock(object);
2575 vm_object_cache_unlock();
2576
2577 vm_object_pager_wakeup(pager);
2578
2579 /*
2580 * Release the pager reference. Note that there's no
2581 * point in trying the memory_object_terminate call
2582 * because the memory_object itself is dead. Also
2583 * release the memory_object_control reference, since
2584 * the pager didn't do that either.
2585 */
2586
2587 memory_object_deallocate(pager);
2588 memory_object_control_deallocate(object->pager_request);
2589
2590
2591 /*
2592 * Restart pending page requests
2593 */
2594 vm_object_lock(object);
2595 vm_object_abort_activity(object);
2596 vm_object_unlock(object);
2597
2598 /*
2599 * Lose the object reference.
2600 */
2601
2602 vm_object_deallocate(object);
2603 }
2604 #endif
2605
2606 /*
2607 * Routine: vm_object_enter
2608 * Purpose:
2609 * Find a VM object corresponding to the given
2610 * pager; if no such object exists, create one,
2611 * and initialize the pager.
2612 */
2613 vm_object_t
2614 vm_object_enter(
2615 memory_object_t pager,
2616 vm_object_size_t size,
2617 boolean_t internal,
2618 boolean_t init,
2619 boolean_t named)
2620 {
2621 register vm_object_t object;
2622 vm_object_t new_object;
2623 boolean_t must_init;
2624 vm_object_hash_entry_t entry, new_entry;
2625
2626 if (pager == MEMORY_OBJECT_NULL)
2627 return(vm_object_allocate(size));
2628
2629 new_object = VM_OBJECT_NULL;
2630 new_entry = VM_OBJECT_HASH_ENTRY_NULL;
2631 must_init = init;
2632
2633 /*
2634 * Look for an object associated with this port.
2635 */
2636
2637 vm_object_cache_lock();
2638 do {
2639 entry = vm_object_hash_lookup(pager, FALSE);
2640
2641 if (entry == VM_OBJECT_HASH_ENTRY_NULL) {
2642 if (new_object == VM_OBJECT_NULL) {
2643 /*
2644 * We must unlock to create a new object;
2645 * if we do so, we must try the lookup again.
2646 */
2647 vm_object_cache_unlock();
2648 assert(new_entry == VM_OBJECT_HASH_ENTRY_NULL);
2649 new_entry = vm_object_hash_entry_alloc(pager);
2650 new_object = vm_object_allocate(size);
2651 vm_object_cache_lock();
2652 } else {
2653 /*
2654 * Lookup failed twice, and we have something
2655 * to insert; set the object.
2656 */
2657 vm_object_hash_insert(new_entry);
2658 entry = new_entry;
2659 entry->object = new_object;
2660 new_entry = VM_OBJECT_HASH_ENTRY_NULL;
2661 new_object = VM_OBJECT_NULL;
2662 must_init = TRUE;
2663 }
2664 } else if (entry->object == VM_OBJECT_NULL) {
2665 /*
2666 * If a previous object is being terminated,
2667 * we must wait for the termination message
2668 * to be queued (and lookup the entry again).
2669 */
2670 entry->waiting = TRUE;
2671 entry = VM_OBJECT_HASH_ENTRY_NULL;
2672 assert_wait((event_t) pager, THREAD_UNINT);
2673 vm_object_cache_unlock();
2674 thread_block(THREAD_CONTINUE_NULL);
2675 vm_object_cache_lock();
2676 }
2677 } while (entry == VM_OBJECT_HASH_ENTRY_NULL);
2678
2679 object = entry->object;
2680 assert(object != VM_OBJECT_NULL);
2681
2682 if (!must_init) {
2683 vm_object_lock(object);
2684 assert(!internal || object->internal);
2685 if (named) {
2686 assert(!object->named);
2687 object->named = TRUE;
2688 }
2689 if (object->ref_count == 0) {
2690 XPR(XPR_VM_OBJECT_CACHE,
2691 "vm_object_enter: removing %x from cache, head (%x, %x)\n",
2692 (integer_t)object,
2693 (integer_t)vm_object_cached_list.next,
2694 (integer_t)vm_object_cached_list.prev, 0,0);
2695 queue_remove(&vm_object_cached_list, object,
2696 vm_object_t, cached_list);
2697 vm_object_cached_count--;
2698 }
2699 object->ref_count++;
2700 vm_object_res_reference(object);
2701 vm_object_unlock(object);
2702
2703 VM_STAT(hits++);
2704 }
2705 assert(object->ref_count > 0);
2706
2707 VM_STAT(lookups++);
2708
2709 vm_object_cache_unlock();
2710
2711 XPR(XPR_VM_OBJECT,
2712 "vm_o_enter: pager 0x%x obj 0x%x must_init %d\n",
2713 (integer_t)pager, (integer_t)object, must_init, 0, 0);
2714
2715 /*
2716 * If we raced to create a vm_object but lost, let's
2717 * throw away ours.
2718 */
2719
2720 if (new_object != VM_OBJECT_NULL)
2721 vm_object_deallocate(new_object);
2722
2723 if (new_entry != VM_OBJECT_HASH_ENTRY_NULL)
2724 vm_object_hash_entry_free(new_entry);
2725
2726 if (must_init) {
2727 memory_object_control_t control;
2728
2729 /*
2730 * Allocate request port.
2731 */
2732
2733 control = memory_object_control_allocate(object);
2734 assert (control != MEMORY_OBJECT_CONTROL_NULL);
2735
2736 vm_object_lock(object);
2737 assert(object != kernel_object);
2738
2739 /*
2740 * Copy the reference we were given.
2741 */
2742
2743 memory_object_reference(pager);
2744 object->pager_created = TRUE;
2745 object->pager = pager;
2746 object->internal = internal;
2747 object->pager_trusted = internal;
2748 if (!internal) {
2749 /* copy strategy invalid until set by memory manager */
2750 object->copy_strategy = MEMORY_OBJECT_COPY_INVALID;
2751 }
2752 object->pager_control = control;
2753 object->pager_ready = FALSE;
2754
2755 vm_object_unlock(object);
2756
2757 /*
2758 * Let the pager know we're using it.
2759 */
2760
2761 (void) memory_object_init(pager,
2762 object->pager_control,
2763 PAGE_SIZE);
2764
2765 vm_object_lock(object);
2766 if (named)
2767 object->named = TRUE;
2768 if (internal) {
2769 object->pager_ready = TRUE;
2770 vm_object_wakeup(object, VM_OBJECT_EVENT_PAGER_READY);
2771 }
2772
2773 object->pager_initialized = TRUE;
2774 vm_object_wakeup(object, VM_OBJECT_EVENT_INITIALIZED);
2775 } else {
2776 vm_object_lock(object);
2777 }
2778
2779 /*
2780 * [At this point, the object must be locked]
2781 */
2782
2783 /*
2784 * Wait for the work above to be done by the first
2785 * thread to map this object.
2786 */
2787
2788 while (!object->pager_initialized) {
2789 vm_object_sleep(object,
2790 VM_OBJECT_EVENT_INITIALIZED,
2791 THREAD_UNINT);
2792 }
2793 vm_object_unlock(object);
2794
2795 XPR(XPR_VM_OBJECT,
2796 "vm_object_enter: vm_object %x, memory_object %x, internal %d\n",
2797 (integer_t)object, (integer_t)object->pager, internal, 0,0);
2798 return(object);
2799 }
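
/*
 * Illustrative sketch, not part of the original file: the calling
 * convention vm_object_pager_create() below uses when it hands a freshly
 * created internal pager to vm_object_enter().  The pager reference is
 * copied inside vm_object_enter() ("Copy the reference we were given"
 * above), so the caller drops the one it was handed afterwards.
 */
#if 0
static void
vm_object_enter_internal_example(
	vm_object_t		object,	/* internal object, unlocked */
	memory_object_t		pager)	/* from memory_object_create() */
{
	if (vm_object_enter(pager, object->size,
			    TRUE,	/* internal */
			    TRUE,	/* init */
			    FALSE)	/* not named */
	    != object)
		panic("vm_object_enter_internal_example: mismatch");

	/* drop the reference memory_object_create() returned */
	memory_object_deallocate(pager);
}
#endif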
2800
2801 /*
2802 * Routine: vm_object_pager_create
2803 * Purpose:
2804 * Create a memory object for an internal object.
2805 * In/out conditions:
2806 * The object is locked on entry and exit;
2807 * it may be unlocked within this call.
2808 * Limitations:
2809 * Only one thread may be performing a
2810 * vm_object_pager_create on an object at
2811 * a time. Presumably, only the pageout
2812 * daemon will be using this routine.
2813 */
2814
2815 void
2816 vm_object_pager_create(
2817 register vm_object_t object)
2818 {
2819 memory_object_t pager;
2820 vm_object_hash_entry_t entry;
2821 #if MACH_PAGEMAP
2822 vm_object_size_t size;
2823 vm_external_map_t map;
2824 #endif /* MACH_PAGEMAP */
2825
2826 XPR(XPR_VM_OBJECT, "vm_object_pager_create, object 0x%X\n",
2827 (integer_t)object, 0,0,0,0);
2828
2829 assert(object != kernel_object);
2830
2831 if (memory_manager_default_check() != KERN_SUCCESS)
2832 return;
2833
2834 /*
2835 * Prevent collapse or termination by holding a paging reference
2836 */
2837
2838 vm_object_paging_begin(object);
2839 if (object->pager_created) {
2840 /*
2841 * Someone else got to it first...
2842 * wait for them to finish initializing the ports
2843 */
2844 while (!object->pager_initialized) {
2845 vm_object_sleep(object,
2846 VM_OBJECT_EVENT_INITIALIZED,
2847 THREAD_UNINT);
2848 }
2849 vm_object_paging_end(object);
2850 return;
2851 }
2852
2853 /*
2854 * Indicate that a memory object has been assigned
2855 * before dropping the lock, to prevent a race.
2856 */
2857
2858 object->pager_created = TRUE;
2859 object->paging_offset = 0;
2860
2861 #if MACH_PAGEMAP
2862 size = object->size;
2863 #endif /* MACH_PAGEMAP */
2864 vm_object_unlock(object);
2865
2866 #if MACH_PAGEMAP
2867 map = vm_external_create(size);
2868 vm_object_lock(object);
2869 assert(object->size == size);
2870 object->existence_map = map;
2871 vm_object_unlock(object);
2872 #endif /* MACH_PAGEMAP */
2873
2874 /*
2875 * Create the [internal] pager, and associate it with this object.
2876 *
2877 * We make the association here so that vm_object_enter()
2878 * can look up the object to complete initializing it. No
2879 * user will ever map this object.
2880 */
2881 {
2882 memory_object_default_t dmm;
2883 vm_size_t cluster_size;
2884
2885 /* acquire a reference for the default memory manager */
2886 dmm = memory_manager_default_reference(&cluster_size);
2887 assert(cluster_size >= PAGE_SIZE);
2888
2889 object->cluster_size = cluster_size; /* XXX ??? */
2890 assert(object->temporary);
2891
2892 /* create our new memory object */
2893 (void) memory_object_create(dmm, object->size, &pager);
2894
2895 memory_object_default_deallocate(dmm);
2896 }
2897
2898 entry = vm_object_hash_entry_alloc(pager);
2899
2900 vm_object_cache_lock();
2901 vm_object_hash_insert(entry);
2902
2903 entry->object = object;
2904 vm_object_cache_unlock();
2905
2906 /*
2907 * A reference was returned by
2908 * memory_object_create(), and it is
2909 * copied by vm_object_enter().
2910 */
2911
2912 if (vm_object_enter(pager, object->size, TRUE, TRUE, FALSE) != object)
2913 panic("vm_object_pager_create: mismatch");
2914
2915 /*
2916 * Drop the reference we were passed.
2917 */
2918 memory_object_deallocate(pager);
2919
2920 vm_object_lock(object);
2921
2922 /*
2923 * Release the paging reference
2924 */
2925 vm_object_paging_end(object);
2926 }
2927
2928 /*
2929 * Routine: vm_object_remove
2930 * Purpose:
2931 * Eliminate the pager/object association
2932 * for this pager.
2933 * Conditions:
2934 * The object cache must be locked.
2935 */
2936 __private_extern__ void
2937 vm_object_remove(
2938 vm_object_t object)
2939 {
2940 memory_object_t pager;
2941
2942 if ((pager = object->pager) != MEMORY_OBJECT_NULL) {
2943 vm_object_hash_entry_t entry;
2944
2945 entry = vm_object_hash_lookup(pager, FALSE);
2946 if (entry != VM_OBJECT_HASH_ENTRY_NULL)
2947 entry->object = VM_OBJECT_NULL;
2948 }
2949
2950 }
2951
2952 /*
2953 * Global variables for vm_object_collapse():
2954 *
2955 * Counts for normal collapses and bypasses.
2956 * Debugging variables, to watch or disable collapse.
2957 */
2958 static long object_collapses = 0;
2959 static long object_bypasses = 0;
2960
2961 static boolean_t vm_object_collapse_allowed = TRUE;
2962 static boolean_t vm_object_bypass_allowed = TRUE;
2963
2964 static int vm_external_discarded;
2965 static int vm_external_collapsed;
2966
2967 unsigned long vm_object_collapse_encrypted = 0;
2968
2969 /*
2970 * Routine: vm_object_do_collapse
2971 * Purpose:
2972 * Collapse an object with the object backing it.
2973 * Pages in the backing object are moved into the
2974 * parent, and the backing object is deallocated.
2975 * Conditions:
2976 * Both objects and the cache are locked; the page
2977 * queues are unlocked.
2978 *
2979 */
2980 static void
2981 vm_object_do_collapse(
2982 vm_object_t object,
2983 vm_object_t backing_object)
2984 {
2985 vm_page_t p, pp;
2986 vm_object_offset_t new_offset, backing_offset;
2987 vm_object_size_t size;
2988
2989 backing_offset = object->shadow_offset;
2990 size = object->size;
2991
2992 /*
2993 * Move all in-memory pages from backing_object
2994 * to the parent. Pages that have been paged out
2995 * will be overwritten by any of the parent's
2996 * pages that shadow them.
2997 */
2998
2999 while (!queue_empty(&backing_object->memq)) {
3000
3001 p = (vm_page_t) queue_first(&backing_object->memq);
3002
3003 new_offset = (p->offset - backing_offset);
3004
3005 assert(!p->busy || p->absent);
3006
3007 /*
3008 * If the parent has a page here, or if
3009 * this page falls outside the parent,
3010 * dispose of it.
3011 *
3012 * Otherwise, move it as planned.
3013 */
3014
3015 if (p->offset < backing_offset || new_offset >= size) {
3016 VM_PAGE_FREE(p);
3017 } else {
3018 /*
3019 * ENCRYPTED SWAP:
3020 * The encryption key includes the "pager" and the
3021 * "paging_offset". These might not be the same in
3022 * the new object, so we can't just move an encrypted
3023 * page from one object to the other. We can't just
3024 * decrypt the page here either, because that would drop
3025 * the object lock.
3026 * The caller should check for encrypted pages before
3027 * attempting to collapse.
3028 */
3029 ASSERT_PAGE_DECRYPTED(p);
3030
3031 pp = vm_page_lookup(object, new_offset);
3032 if (pp == VM_PAGE_NULL) {
3033
3034 /*
3035 * Parent now has no page.
3036 * Move the backing object's page up.
3037 */
3038
3039 vm_page_rename(p, object, new_offset);
3040 #if MACH_PAGEMAP
3041 } else if (pp->absent) {
3042
3043 /*
3044 * Parent has an absent page...
3045 * it's not being paged in, so
3046 * it must really be missing from
3047 * the parent.
3048 *
3049 * Throw out the absent page...
3050 * any faults looking for that
3051 * page will restart with the new
3052 * one.
3053 */
3054
3055 VM_PAGE_FREE(pp);
3056 vm_page_rename(p, object, new_offset);
3057 #endif /* MACH_PAGEMAP */
3058 } else {
3059 assert(! pp->absent);
3060
3061 /*
3062 * Parent object has a real page.
3063 * Throw away the backing object's
3064 * page.
3065 */
3066 VM_PAGE_FREE(p);
3067 }
3068 }
3069 }
3070
3071 #if !MACH_PAGEMAP
3072 assert((!object->pager_created && object->pager == MEMORY_OBJECT_NULL)
3073 || (!backing_object->pager_created
3074 && backing_object->pager == MEMORY_OBJECT_NULL));
3075 #else
3076 assert(!object->pager_created && object->pager == MEMORY_OBJECT_NULL);
3077 #endif /* !MACH_PAGEMAP */
3078
3079 if (backing_object->pager != MEMORY_OBJECT_NULL) {
3080 vm_object_hash_entry_t entry;
3081
3082 /*
3083 * Move the pager from backing_object to object.
3084 *
3085 * XXX We're only using part of the paging space
3086 * for keeps now... we ought to discard the
3087 * unused portion.
3088 */
3089
3090 assert(!object->paging_in_progress);
3091 object->pager = backing_object->pager;
3092 entry = vm_object_hash_lookup(object->pager, FALSE);
3093 assert(entry != VM_OBJECT_HASH_ENTRY_NULL);
3094 entry->object = object;
3095 object->pager_created = backing_object->pager_created;
3096 object->pager_control = backing_object->pager_control;
3097 object->pager_ready = backing_object->pager_ready;
3098 object->pager_initialized = backing_object->pager_initialized;
3099 object->cluster_size = backing_object->cluster_size;
3100 object->paging_offset =
3101 backing_object->paging_offset + backing_offset;
3102 if (object->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
3103 memory_object_control_collapse(object->pager_control,
3104 object);
3105 }
3106 }
3107
3108 vm_object_cache_unlock();
3109
3110 #if MACH_PAGEMAP
3111 /*
3112 * If the shadow offset is 0, then use the existence map from
3113 * the backing object if there is one. If the shadow offset is
3114 * not zero, toss it.
3115 *
3116 * XXX - If the shadow offset is not 0 then a bit copy is needed
3117 * if the map is to be salvaged. For now, we just toss the
3118 * old map, giving the collapsed object no map. This means that
3119 * the pager is invoked for zero fill pages. If analysis shows
3120 * that this happens frequently and is a performance hit, then
3121 * this code should be fixed to salvage the map.
3122 */
3123 assert(object->existence_map == VM_EXTERNAL_NULL);
3124 if (backing_offset || (size != backing_object->size)) {
3125 vm_external_discarded++;
3126 vm_external_destroy(backing_object->existence_map,
3127 backing_object->size);
3128 }
3129 else {
3130 vm_external_collapsed++;
3131 object->existence_map = backing_object->existence_map;
3132 }
3133 backing_object->existence_map = VM_EXTERNAL_NULL;
3134 #endif /* MACH_PAGEMAP */
3135
3136 /*
3137 * Object now shadows whatever backing_object did.
3138 * Note that the reference to backing_object->shadow
3139 * moves from within backing_object to within object.
3140 */
3141
3142 assert(!object->phys_contiguous);
3143 assert(!backing_object->phys_contiguous);
3144 object->shadow = backing_object->shadow;
3145 if (object->shadow) {
3146 object->shadow_offset += backing_object->shadow_offset;
3147 } else {
3148 /* no shadow, therefore no shadow offset... */
3149 object->shadow_offset = 0;
3150 }
3151 assert((object->shadow == VM_OBJECT_NULL) ||
3152 (object->shadow->copy != backing_object));
3153
3154 /*
3155 * Discard backing_object.
3156 *
3157 * Since the backing object has no pages, no
3158 * pager left, and no object references within it,
3159 * all that is necessary is to dispose of it.
3160 */
3161
3162 assert((backing_object->ref_count == 1) &&
3163 (backing_object->resident_page_count == 0) &&
3164 (backing_object->paging_in_progress == 0));
3165
3166 backing_object->alive = FALSE;
3167 vm_object_unlock(backing_object);
3168
3169 XPR(XPR_VM_OBJECT, "vm_object_collapse, collapsed 0x%X\n",
3170 (integer_t)backing_object, 0,0,0,0);
3171
3172 zfree(vm_object_zone, backing_object);
3173
3174 object_collapses++;
3175 }
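
/*
 * Illustrative sketch, not part of the original file: the page
 * disposition rule applied by the collapse loop above to each page of
 * the backing object.  The helper name and enum are hypothetical, and
 * the MACH_PAGEMAP absent-page case is omitted; the real loop frees or
 * renames the page directly.
 */
#if 0
typedef enum {
	COLLAPSE_FREE_BACKING_PAGE,	/* VM_PAGE_FREE(p)               */
	COLLAPSE_MOVE_TO_PARENT		/* vm_page_rename(p, object, ..) */
} collapse_disposition_t;

static collapse_disposition_t
collapse_page_disposition(
	vm_object_t		object,		/* parent, locked */
	vm_object_offset_t	backing_offset,	/* object->shadow_offset */
	vm_object_size_t	size,		/* object->size */
	vm_page_t		p)		/* page of backing object */
{
	vm_object_offset_t	new_offset = p->offset - backing_offset;

	/* page falls outside the parent's window onto the backing object */
	if (p->offset < backing_offset || new_offset >= size)
		return COLLAPSE_FREE_BACKING_PAGE;

	/* parent already covers this offset with a page of its own */
	if (vm_page_lookup(object, new_offset) != VM_PAGE_NULL)
		return COLLAPSE_FREE_BACKING_PAGE;

	/* parent has no page here: promote the backing object's page */
	return COLLAPSE_MOVE_TO_PARENT;
}
#endif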
3176
3177 static void
3178 vm_object_do_bypass(
3179 vm_object_t object,
3180 vm_object_t backing_object)
3181 {
3182 /*
3183 * Make the parent shadow the next object
3184 * in the chain.
3185 */
3186
3187 #if TASK_SWAPPER
3188 /*
3189 * Do object reference in-line to
3190 * conditionally increment shadow's
3191 * residence count. If object is not
3192 * resident, leave residence count
3193 * on shadow alone.
3194 */
3195 if (backing_object->shadow != VM_OBJECT_NULL) {
3196 vm_object_lock(backing_object->shadow);
3197 backing_object->shadow->ref_count++;
3198 if (object->res_count != 0)
3199 vm_object_res_reference(backing_object->shadow);
3200 vm_object_unlock(backing_object->shadow);
3201 }
3202 #else /* TASK_SWAPPER */
3203 vm_object_reference(backing_object->shadow);
3204 #endif /* TASK_SWAPPER */
3205
3206 assert(!object->phys_contiguous);
3207 assert(!backing_object->phys_contiguous);
3208 object->shadow = backing_object->shadow;
3209 if (object->shadow) {
3210 object->shadow_offset += backing_object->shadow_offset;
3211 } else {
3212 /* no shadow, therefore no shadow offset... */
3213 object->shadow_offset = 0;
3214 }
3215
3216 /*
3217 * Backing object might have had a copy pointer
3218 * to us. If it did, clear it.
3219 */
3220 if (backing_object->copy == object) {
3221 backing_object->copy = VM_OBJECT_NULL;
3222 }
3223
3224 /*
3225 * Drop the reference count on backing_object.
3226 #if TASK_SWAPPER
3227 * Since its ref_count was at least 2, it
3228 * will not vanish; so we don't need to call
3229 * vm_object_deallocate.
3230 * [FBDP: that doesn't seem to be true any more]
3231 *
3232 * The res_count on the backing object is
3233 * conditionally decremented. It's possible
3234 * (via vm_pageout_scan) to get here with
3235 * a "swapped" object, which has a 0 res_count,
3236 * in which case, the backing object res_count
3237 * is already down by one.
3238 #else
3239 * Don't call vm_object_deallocate unless
3240 * ref_count drops to zero.
3241 *
3242 * The ref_count can drop to zero here if the
3243 * backing object could be bypassed but not
3244 * collapsed, such as when the backing object
3245 * is temporary and cacheable.
3246 #endif
3247 */
3248 if (backing_object->ref_count > 1) {
3249 backing_object->ref_count--;
3250 #if TASK_SWAPPER
3251 if (object->res_count != 0)
3252 vm_object_res_deallocate(backing_object);
3253 assert(backing_object->ref_count > 0);
3254 #endif /* TASK_SWAPPER */
3255 vm_object_unlock(backing_object);
3256 } else {
3257
3258 /*
3259 * Drop locks so that we can deallocate
3260 * the backing object.
3261 */
3262
3263 #if TASK_SWAPPER
3264 if (object->res_count == 0) {
3265 /* XXX get a reference for the deallocate below */
3266 vm_object_res_reference(backing_object);
3267 }
3268 #endif /* TASK_SWAPPER */
3269 vm_object_unlock(object);
3270 vm_object_unlock(backing_object);
3271 vm_object_deallocate(backing_object);
3272
3273 /*
3274 * Relock object. We don't have to reverify
3275 * its state since vm_object_collapse will
3276 * do that for us as it starts at the
3277 * top of its loop.
3278 */
3279
3280 vm_object_lock(object);
3281 }
3282
3283 object_bypasses++;
3284 }
3285
3286
3287 /*
3288 * vm_object_collapse:
3289 *
3290 * Perform an object collapse or an object bypass if appropriate.
3291 * The real work of collapsing and bypassing is performed in
3292 * the routines vm_object_do_collapse and vm_object_do_bypass.
3293 *
3294 * Requires that the object be locked and the page queues be unlocked.
3295 *
3296 */
3297 static unsigned long vm_object_collapse_calls = 0;
3298 static unsigned long vm_object_collapse_objects = 0;
3299 static unsigned long vm_object_collapse_do_collapse = 0;
3300 static unsigned long vm_object_collapse_do_bypass = 0;
3301 __private_extern__ void
3302 vm_object_collapse(
3303 register vm_object_t object,
3304 register vm_object_offset_t hint_offset,
3305 boolean_t can_bypass)
3306 {
3307 register vm_object_t backing_object;
3308 register unsigned int rcount;
3309 register unsigned int size;
3310 vm_object_offset_t collapse_min_offset;
3311 vm_object_offset_t collapse_max_offset;
3312 vm_page_t page;
3313 vm_object_t original_object;
3314
3315 vm_object_collapse_calls++;
3316
3317 if (! vm_object_collapse_allowed &&
3318 ! (can_bypass && vm_object_bypass_allowed)) {
3319 return;
3320 }
3321
3322 XPR(XPR_VM_OBJECT, "vm_object_collapse, obj 0x%X\n",
3323 (integer_t)object, 0,0,0,0);
3324
3325 if (object == VM_OBJECT_NULL)
3326 return;
3327
3328 original_object = object;
3329
3330 while (TRUE) {
3331 vm_object_collapse_objects++;
3332 /*
3333 * Verify that the conditions are right for either
3334 * collapse or bypass:
3335 */
3336
3337 /*
3338 * There is a backing object, and
3339 */
3340
3341 backing_object = object->shadow;
3342 if (backing_object == VM_OBJECT_NULL) {
3343 if (object != original_object) {
3344 vm_object_unlock(object);
3345 }
3346 return;
3347 }
3348
3349 /*
3350 * No pages in the object are currently
3351 * being paged out, and
3352 */
3353 if (object->paging_in_progress != 0 ||
3354 object->absent_count != 0) {
3355 /* try and collapse the rest of the shadow chain */
3356 vm_object_lock(backing_object);
3357 if (object != original_object) {
3358 vm_object_unlock(object);
3359 }
3360 object = backing_object;
3361 continue;
3362 }
3363
3364 vm_object_lock(backing_object);
3365
3366 /*
3367 * ...
3368 * The backing object is not read_only,
3369 * and no pages in the backing object are
3370 * currently being paged out.
3371 * The backing object is internal.
3372 *
3373 */
3374
3375 if (!backing_object->internal ||
3376 backing_object->paging_in_progress != 0) {
3377 /* try and collapse the rest of the shadow chain */
3378 if (object != original_object) {
3379 vm_object_unlock(object);
3380 }
3381 object = backing_object;
3382 continue;
3383 }
3384
3385 /*
3386 * The backing object can't be a copy-object:
3387 * the shadow_offset for the copy-object must stay
3388 * as 0. Furthermore (for the 'we have all the
3389 * pages' case), if we bypass backing_object and
3390 * just shadow the next object in the chain, old
3391 * pages from that object would then have to be copied
3392 * BOTH into the (former) backing_object and into the
3393 * parent object.
3394 */
3395 if (backing_object->shadow != VM_OBJECT_NULL &&
3396 backing_object->shadow->copy == backing_object) {
3397 /* try and collapse the rest of the shadow chain */
3398 if (object != original_object) {
3399 vm_object_unlock(object);
3400 }
3401 object = backing_object;
3402 continue;
3403 }
3404
3405 /*
3406 * We can now try to either collapse the backing
3407 * object (if the parent is the only reference to
3408 * it) or (perhaps) remove the parent's reference
3409 * to it.
3410 *
3411 * If there is exactly one reference to the backing
3412 * object, we may be able to collapse it into the
3413 * parent.
3414 *
3415 * If MACH_PAGEMAP is defined:
3416 * The parent must not have a pager created for it,
3417 * since collapsing a backing_object dumps new pages
3418 * into the parent that its pager doesn't know about
3419 * (and the collapse code can't merge the existence
3420 * maps).
3421 * Otherwise:
3422 * As long as one of the objects is still not known
3423 * to the pager, we can collapse them.
3424 */
3425 if (backing_object->ref_count == 1 &&
3426 (!object->pager_created
3427 #if !MACH_PAGEMAP
3428 || !backing_object->pager_created
3429 #endif /*!MACH_PAGEMAP */
3430 ) && vm_object_collapse_allowed) {
3431
3432 XPR(XPR_VM_OBJECT,
3433 "vm_object_collapse: %x to %x, pager %x, pager_control %x\n",
3434 (integer_t)backing_object, (integer_t)object,
3435 (integer_t)backing_object->pager,
3436 (integer_t)backing_object->pager_control, 0);
3437
3438 /*
3439 * We need the cache lock for collapsing,
3440 * but we must not deadlock.
3441 */
3442
3443 if (! vm_object_cache_lock_try()) {
3444 if (object != original_object) {
3445 vm_object_unlock(object);
3446 }
3447 vm_object_unlock(backing_object);
3448 return;
3449 }
3450
3451 /*
3452 * ENCRYPTED SWAP
3453 * We can't collapse the object if it contains
3454 * any encrypted page, because the encryption key
3455 * includes the <object,offset> info. We can't
3456 * drop the object lock in vm_object_do_collapse()
3457 * so we can't decrypt the page there either.
3458 */
3459 if (vm_pages_encrypted) {
3460 collapse_min_offset = object->shadow_offset;
3461 collapse_max_offset =
3462 object->shadow_offset + object->size;
3463 queue_iterate(&backing_object->memq,
3464 page, vm_page_t, listq) {
3465 if (page->encrypted &&
3466 (page->offset >=
3467 collapse_min_offset) &&
3468 (page->offset <
3469 collapse_max_offset)) {
3470 /*
3471 * We found an encrypted page
3472 * in the backing object,
3473 * within the range covered
3474 * by the parent object: we can
3475 * not collapse them.
3476 */
3477 vm_object_collapse_encrypted++;
3478 vm_object_cache_unlock();
3479 goto try_bypass;
3480 }
3481 }
3482 }
3483
3484 /*
3485 * Collapse the object with its backing
3486 * object, and try again with the object's
3487 * new backing object.
3488 */
3489
3490 vm_object_do_collapse(object, backing_object);
3491 vm_object_collapse_do_collapse++;
3492 continue;
3493 }
3494
3495 try_bypass:
3496 /*
3497 * Collapsing the backing object was not possible
3498 * or permitted, so let's try bypassing it.
3499 */
3500
3501 if (! (can_bypass && vm_object_bypass_allowed)) {
3502 /* try and collapse the rest of the shadow chain */
3503 if (object != original_object) {
3504 vm_object_unlock(object);
3505 }
3506 object = backing_object;
3507 continue;
3508 }
3509
3510
3511 /*
3512 * If the object doesn't have all its pages present,
3513 * we have to make sure no pages in the backing object
3514 * "show through" before bypassing it.
3515 */
3516 size = atop(object->size);
3517 rcount = object->resident_page_count;
3518 if (rcount != size) {
3519 vm_object_offset_t offset;
3520 vm_object_offset_t backing_offset;
3521 unsigned int backing_rcount;
3522 unsigned int lookups = 0;
3523
3524 /*
3525 * If the backing object has a pager but no pagemap,
3526 * then we cannot bypass it, because we don't know
3527 * what pages it has.
3528 */
3529 if (backing_object->pager_created
3530 #if MACH_PAGEMAP
3531 && (backing_object->existence_map == VM_EXTERNAL_NULL)
3532 #endif /* MACH_PAGEMAP */
3533 ) {
3534 /* try and collapse the rest of the shadow chain */
3535 if (object != original_object) {
3536 vm_object_unlock(object);
3537 }
3538 object = backing_object;
3539 continue;
3540 }
3541
3542 /*
3543 * If the object has a pager but no pagemap,
3544 * then we cannot bypass it, because we don't know
3545 * what pages it has.
3546 */
3547 if (object->pager_created
3548 #if MACH_PAGEMAP
3549 && (object->existence_map == VM_EXTERNAL_NULL)
3550 #endif /* MACH_PAGEMAP */
3551 ) {
3552 /* try and collapse the rest of the shadow chain */
3553 if (object != original_object) {
3554 vm_object_unlock(object);
3555 }
3556 object = backing_object;
3557 continue;
3558 }
3559
3560 /*
3561 * If all of the pages in the backing object are
3562 * shadowed by the parent object, the parent
3563 * object no longer has to shadow the backing
3564 * object; it can shadow the next one in the
3565 * chain.
3566 *
3567 * If the backing object has existence info,
3568 * we must examine its existence info
3569 * as well.
3570 *
3571 */
3572
3573 backing_offset = object->shadow_offset;
3574 backing_rcount = backing_object->resident_page_count;
3575
3576 #define EXISTS_IN_OBJECT(obj, off, rc) \
3577 (vm_external_state_get((obj)->existence_map, \
3578 (vm_offset_t)(off)) == VM_EXTERNAL_STATE_EXISTS || \
3579 ((rc) && ++lookups && vm_page_lookup((obj), (off)) != VM_PAGE_NULL && (rc)--))
3580
3581 /*
3582 * Check the hint location first
3583 * (since it is often the quickest way out of here).
3584 */
3585 if (object->cow_hint != ~(vm_offset_t)0)
3586 hint_offset = (vm_object_offset_t)object->cow_hint;
3587 else
3588 hint_offset = (hint_offset > 8 * PAGE_SIZE_64) ?
3589 (hint_offset - 8 * PAGE_SIZE_64) : 0;
3590
3591 if (EXISTS_IN_OBJECT(backing_object, hint_offset +
3592 backing_offset, backing_rcount) &&
3593 !EXISTS_IN_OBJECT(object, hint_offset, rcount)) {
3594 /* dependency right at the hint */
3595 object->cow_hint = (vm_offset_t)hint_offset;
3596 /* try and collapse the rest of the shadow chain */
3597 if (object != original_object) {
3598 vm_object_unlock(object);
3599 }
3600 object = backing_object;
3601 continue;
3602 }
3603
3604 /*
3605 * If the object's window onto the backing_object
3606 * is large compared to the number of resident
3607 * pages in the backing object, it makes sense to
3608 * walk the backing_object's resident pages first.
3609 *
3610 * NOTE: Pages may be in both the existence map and
3611 * resident. So, we can't permanently decrement
3612 * the rcount here because the second loop may
3613 * find the same pages in the backing object's
3614 * existence map that we found here and we would
3615 * double-decrement the rcount. We also may or
3616 * may not have found the
3617 */
3618 if (backing_rcount && size >
3619 ((backing_object->existence_map) ?
3620 backing_rcount : (backing_rcount >> 1))) {
3621 unsigned int rc = rcount;
3622 vm_page_t p;
3623
3624 backing_rcount = backing_object->resident_page_count;
3625 p = (vm_page_t)queue_first(&backing_object->memq);
3626 do {
3627 /* Until we get more than one lookup lock */
3628 if (lookups > 256) {
3629 lookups = 0;
3630 delay(1);
3631 }
3632
3633 offset = (p->offset - backing_offset);
3634 if (offset < object->size &&
3635 offset != hint_offset &&
3636 !EXISTS_IN_OBJECT(object, offset, rc)) {
3637 /* found a dependency */
3638 object->cow_hint = (vm_offset_t)offset;
3639 break;
3640 }
3641 p = (vm_page_t) queue_next(&p->listq);
3642
3643 } while (--backing_rcount);
3644 if (backing_rcount != 0 ) {
3645 /* try and collapse the rest of the shadow chain */
3646 if (object != original_object) {
3647 vm_object_unlock(object);
3648 }
3649 object = backing_object;
3650 continue;
3651 }
3652 }
3653
3654 /*
3655 * Walk through the offsets looking for pages in the
3656 * backing object that show through to the object.
3657 */
3658 if (backing_rcount || backing_object->existence_map) {
3659 offset = hint_offset;
3660
3661 while((offset =
3662 (offset + PAGE_SIZE_64 < object->size) ?
3663 (offset + PAGE_SIZE_64) : 0) != hint_offset) {
3664
3665 /* Until we get more than one lookup lock */
3666 if (lookups > 256) {
3667 lookups = 0;
3668 delay(1);
3669 }
3670
3671 if (EXISTS_IN_OBJECT(backing_object, offset +
3672 backing_offset, backing_rcount) &&
3673 !EXISTS_IN_OBJECT(object, offset, rcount)) {
3674 /* found a dependency */
3675 object->cow_hint = (vm_offset_t)offset;
3676 break;
3677 }
3678 }
3679 if (offset != hint_offset) {
3680 /* try and collapse the rest of the shadow chain */
3681 if (object != original_object) {
3682 vm_object_unlock(object);
3683 }
3684 object = backing_object;
3685 continue;
3686 }
3687 }
3688 }
3689
3690 /* reset the offset hint for any objects deeper in the chain */
3691 object->cow_hint = (vm_offset_t)0;
3692
3693 /*
3694 * All interesting pages in the backing object
3695 * already live in the parent or its pager.
3696 * Thus we can bypass the backing object.
3697 */
3698
3699 vm_object_do_bypass(object, backing_object);
3700 vm_object_collapse_do_bypass++;
3701
3702 /*
3703 * Try again with this object's new backing object.
3704 */
3705
3706 continue;
3707 }
3708
3709 if (object != original_object) {
3710 vm_object_unlock(object);
3711 }
3712 }
3713
3714 /*
3715 * Routine: vm_object_page_remove: [internal]
3716 * Purpose:
3717 * Removes all physical pages in the specified
3718 * object range from the object's list of pages.
3719 *
3720 * In/out conditions:
3721 * The object must be locked.
3722 * The object must not have paging_in_progress, usually
3723 * guaranteed by not having a pager.
3724 */
3725 unsigned int vm_object_page_remove_lookup = 0;
3726 unsigned int vm_object_page_remove_iterate = 0;
3727
3728 __private_extern__ void
3729 vm_object_page_remove(
3730 register vm_object_t object,
3731 register vm_object_offset_t start,
3732 register vm_object_offset_t end)
3733 {
3734 register vm_page_t p, next;
3735
3736 /*
3737 * One and two page removals are most popular.
3738 * The factor of 16 here is somewhat arbitrary.
3739 * It balances vm_object_lookup vs iteration.
3740 */
3741
3742 if (atop_64(end - start) < (unsigned)object->resident_page_count/16) {
3743 vm_object_page_remove_lookup++;
3744
3745 for (; start < end; start += PAGE_SIZE_64) {
3746 p = vm_page_lookup(object, start);
3747 if (p != VM_PAGE_NULL) {
3748 assert(!p->cleaning && !p->pageout);
3749 if (!p->fictitious)
3750 pmap_disconnect(p->phys_page);
3751 VM_PAGE_FREE(p);
3752 }
3753 }
3754 } else {
3755 vm_object_page_remove_iterate++;
3756
3757 p = (vm_page_t) queue_first(&object->memq);
3758 while (!queue_end(&object->memq, (queue_entry_t) p)) {
3759 next = (vm_page_t) queue_next(&p->listq);
3760 if ((start <= p->offset) && (p->offset < end)) {
3761 assert(!p->cleaning && !p->pageout);
3762 if (!p->fictitious)
3763 pmap_disconnect(p->phys_page);
3764 VM_PAGE_FREE(p);
3765 }
3766 p = next;
3767 }
3768 }
3769 }
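
/*
 * Illustrative sketch, not part of the original file: the heuristic
 * vm_object_page_remove() above uses to choose between per-offset
 * lookups and a single walk of the object's page queue.  The helper
 * name is hypothetical.
 */
#if 0
static boolean_t
page_remove_should_use_lookup(
	vm_object_t		object,
	vm_object_offset_t	start,
	vm_object_offset_t	end)
{
	/*
	 * Removing a few pages from a populous object: per-offset
	 * vm_page_lookup() calls are cheaper than visiting every
	 * resident page.  The divisor of 16 is the arbitrary balance
	 * point noted above.
	 */
	return atop_64(end - start) <
	    (unsigned) object->resident_page_count / 16;
}
#endif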
3770
3771
3772 /*
3773 * Routine: vm_object_coalesce
3774 * Function: Coalesces two objects backing up adjoining
3775 * regions of memory into a single object.
3776 *
3777 * returns TRUE if objects were combined.
3778 *
3779 * NOTE: Only works at the moment if the second object is NULL -
3780 * if it's not, which object do we lock first?
3781 *
3782 * Parameters:
3783 * prev_object First object to coalesce
3784 * prev_offset Offset into prev_object
3785 * next_object Second object to coalesce
3786 * next_offset Offset into next_object
3787 *
3788 * prev_size Size of reference to prev_object
3789 * next_size Size of reference to next_object
3790 *
3791 * Conditions:
3792 * The object(s) must *not* be locked. The map must be locked
3793 * to preserve the reference to the object(s).
3794 */
3795 static int vm_object_coalesce_count = 0;
3796
3797 __private_extern__ boolean_t
3798 vm_object_coalesce(
3799 register vm_object_t prev_object,
3800 vm_object_t next_object,
3801 vm_object_offset_t prev_offset,
3802 __unused vm_object_offset_t next_offset,
3803 vm_object_size_t prev_size,
3804 vm_object_size_t next_size)
3805 {
3806 vm_object_size_t newsize;
3807
3808 #ifdef lint
3809 next_offset++;
3810 #endif /* lint */
3811
3812 if (next_object != VM_OBJECT_NULL) {
3813 return(FALSE);
3814 }
3815
3816 if (prev_object == VM_OBJECT_NULL) {
3817 return(TRUE);
3818 }
3819
3820 XPR(XPR_VM_OBJECT,
3821 "vm_object_coalesce: 0x%X prev_off 0x%X prev_size 0x%X next_size 0x%X\n",
3822 (integer_t)prev_object, prev_offset, prev_size, next_size, 0);
3823
3824 vm_object_lock(prev_object);
3825
3826 /*
3827 * Try to collapse the object first
3828 */
3829 vm_object_collapse(prev_object, prev_offset, TRUE);
3830
3831 /*
3832 * Can't coalesce if pages not mapped to
3833 * prev_entry may be in use in any way:
3834 * . more than one reference
3835 * . paged out
3836 * . shadows another object
3837 * . has a copy elsewhere
3838 * . is purgable
3839 * . paging references (pages might be in page-list)
3840 */
3841
3842 if ((prev_object->ref_count > 1) ||
3843 prev_object->pager_created ||
3844 (prev_object->shadow != VM_OBJECT_NULL) ||
3845 (prev_object->copy != VM_OBJECT_NULL) ||
3846 (prev_object->true_share != FALSE) ||
3847 (prev_object->purgable != VM_OBJECT_NONPURGABLE) ||
3848 (prev_object->paging_in_progress != 0)) {
3849 vm_object_unlock(prev_object);
3850 return(FALSE);
3851 }
3852
3853 vm_object_coalesce_count++;
3854
3855 /*
3856 * Remove any pages that may still be in the object from
3857 * a previous deallocation.
3858 */
3859 vm_object_page_remove(prev_object,
3860 prev_offset + prev_size,
3861 prev_offset + prev_size + next_size);
3862
3863 /*
3864 * Extend the object if necessary.
3865 */
3866 newsize = prev_offset + prev_size + next_size;
3867 if (newsize > prev_object->size) {
3868 #if MACH_PAGEMAP
3869 /*
3870 * We cannot extend an object that has existence info,
3871 * since the existence info might then fail to cover
3872 * the entire object.
3873 *
3874 * This assertion must be true because the object
3875 * has no pager, and we only create existence info
3876 * for objects with pagers.
3877 */
3878 assert(prev_object->existence_map == VM_EXTERNAL_NULL);
3879 #endif /* MACH_PAGEMAP */
3880 prev_object->size = newsize;
3881 }
3882
3883 vm_object_unlock(prev_object);
3884 return(TRUE);
3885 }
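/*
 * Illustrative sketch (not part of the build): how a map layer might
 * call vm_object_coalesce() when extending an existing entry.  The
 * entry_* and grow_size names below are hypothetical; the call itself
 * follows the signature and locking conditions documented above
 * (objects unlocked, map locked by the caller, next_object NULL).
 *
 *	if (vm_object_coalesce(entry_object,		// prev_object
 *			       VM_OBJECT_NULL,		// next_object
 *			       entry_offset,		// prev_offset
 *			       (vm_object_offset_t) 0,	// next_offset (unused)
 *			       entry_size,		// prev_size
 *			       grow_size)) {		// next_size
 *		// prev_object now covers prev_offset through
 *		// prev_offset + prev_size + next_size; extend the map
 *		// entry in place
 *	} else {
 *		// allocate a separate object for the new range instead
 *	}
 */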
3886
3887 /*
3888 * Attach a set of physical pages to an object, so that they can
3889 * be mapped by mapping the object. Typically used to map IO memory.
3890 *
3891 * The mapping function and its private data are used to obtain the
3892 * physical addresses for each page to be mapped.
3893 */
3894 void
3895 vm_object_page_map(
3896 vm_object_t object,
3897 vm_object_offset_t offset,
3898 vm_object_size_t size,
3899 vm_object_offset_t (*map_fn)(void *map_fn_data,
3900 vm_object_offset_t offset),
3901 void *map_fn_data) /* private to map_fn */
3902 {
3903 int num_pages;
3904 int i;
3905 vm_page_t m;
3906 vm_page_t old_page;
3907 vm_object_offset_t addr;
3908
3909 num_pages = atop_64(size);
3910
3911 for (i = 0; i < num_pages; i++, offset += PAGE_SIZE_64) {
3912
3913 addr = (*map_fn)(map_fn_data, offset);
3914
3915 while ((m = vm_page_grab_fictitious()) == VM_PAGE_NULL)
3916 vm_page_more_fictitious();
3917
3918 vm_object_lock(object);
3919 if ((old_page = vm_page_lookup(object, offset))
3920 != VM_PAGE_NULL)
3921 {
3922 vm_page_lock_queues();
3923 vm_page_free(old_page);
3924 vm_page_unlock_queues();
3925 }
3926
3927 vm_page_init(m, addr);
3928 /* private normally requires lock_queues but since we */
3929 /* are initializing the page, it's not necessary here */
3930 m->private = TRUE; /* don't free page */
3931 m->wire_count = 1;
3932 vm_page_insert(m, object, offset);
3933
3934 PAGE_WAKEUP_DONE(m);
3935 vm_object_unlock(object);
3936 }
3937 }
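/*
 * Illustrative sketch (hypothetical, not part of the build): a map_fn
 * suitable for vm_object_page_map().  The function is handed the
 * caller's private data pointer and an offset into the object, and must
 * return the physical address backing that page.  Here the private data
 * is assumed to point at the base physical address of a linear device
 * aperture.
 *
 *	static vm_object_offset_t
 *	example_map_fn(void *map_fn_data, vm_object_offset_t offset)
 *	{
 *		vm_object_offset_t base = *(vm_object_offset_t *) map_fn_data;
 *
 *		return base + offset;
 *	}
 *
 *	vm_object_page_map(object, 0, size, example_map_fn, &aperture_base);
 */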
3938
3939 #include <mach_kdb.h>
3940
3941 #if MACH_KDB
3942 #include <ddb/db_output.h>
3943 #include <vm/vm_print.h>
3944
3945 #define printf kdbprintf
3946
3947 extern boolean_t vm_object_cached(
3948 vm_object_t object);
3949
3950 extern void print_bitstring(
3951 char byte);
3952
3953 boolean_t vm_object_print_pages = FALSE;
3954
3955 void
3956 print_bitstring(
3957 char byte)
3958 {
3959 printf("%c%c%c%c%c%c%c%c",
3960 ((byte & (1 << 0)) ? '1' : '0'),
3961 ((byte & (1 << 1)) ? '1' : '0'),
3962 ((byte & (1 << 2)) ? '1' : '0'),
3963 ((byte & (1 << 3)) ? '1' : '0'),
3964 ((byte & (1 << 4)) ? '1' : '0'),
3965 ((byte & (1 << 5)) ? '1' : '0'),
3966 ((byte & (1 << 6)) ? '1' : '0'),
3967 ((byte & (1 << 7)) ? '1' : '0'));
3968 }
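/*
 * Note that bit 0 is printed first, so the output reads low bit to high
 * bit from left to right; e.g. print_bitstring(0x05) prints "10100000".
 */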
3969
3970 boolean_t
3971 vm_object_cached(
3972 register vm_object_t object)
3973 {
3974 register vm_object_t o;
3975
3976 queue_iterate(&vm_object_cached_list, o, vm_object_t, cached_list) {
3977 if (object == o) {
3978 return TRUE;
3979 }
3980 }
3981 return FALSE;
3982 }
3983
3984 #if MACH_PAGEMAP
3985 /*
3986 * vm_external_print: [ debug ]
3987 */
3988 void
3989 vm_external_print(
3990 vm_external_map_t emap,
3991 vm_size_t size)
3992 {
3993 if (emap == VM_EXTERNAL_NULL) {
3994 printf("0 ");
3995 } else {
3996 vm_size_t existence_size = stob(size);
3997 printf("{ size=%d, map=[", existence_size);
3998 if (existence_size > 0) {
3999 print_bitstring(emap[0]);
4000 }
4001 if (existence_size > 1) {
4002 print_bitstring(emap[1]);
4003 }
4004 if (existence_size > 2) {
4005 printf("...");
4006 print_bitstring(emap[existence_size-1]);
4007 }
4008 printf("] }\n");
4009 }
4010 return;
4011 }
4012 #endif /* MACH_PAGEMAP */
4013
4014 int
4015 vm_follow_object(
4016 vm_object_t object)
4017 {
4018 int count = 0;
4019 int orig_db_indent = db_indent;
4020
4021 while (TRUE) {
4022 if (object == VM_OBJECT_NULL) {
4023 db_indent = orig_db_indent;
4024 return count;
4025 }
4026
4027 count += 1;
4028
4029 iprintf("object 0x%x", object);
4030 printf(", shadow=0x%x", object->shadow);
4031 printf(", copy=0x%x", object->copy);
4032 printf(", pager=0x%x", object->pager);
4033 printf(", ref=%d\n", object->ref_count);
4034
4035 db_indent += 2;
4036 object = object->shadow;
4037 }
4038
4039 }
4040
4041 /*
4042 * vm_object_print: [ debug ]
4043 */
4044 void
4045 vm_object_print(
4046 db_addr_t db_addr,
4047 __unused boolean_t have_addr,
4048 __unused int arg_count,
4049 __unused char *modif)
4050 {
4051 vm_object_t object;
4052 register vm_page_t p;
4053 const char *s;
4054
4055 register int count;
4056
4057 object = (vm_object_t) (long) db_addr;
4058 if (object == VM_OBJECT_NULL)
4059 return;
4060
4061 iprintf("object 0x%x\n", object);
4062
4063 db_indent += 2;
4064
4065 iprintf("size=0x%x", object->size);
4066 printf(", cluster=0x%x", object->cluster_size);
4067 printf(", memq_hint=%p", object->memq_hint);
4068 printf(", ref_count=%d\n", object->ref_count);
4069 iprintf("");
4070 #if TASK_SWAPPER
4071 printf("res_count=%d, ", object->res_count);
4072 #endif /* TASK_SWAPPER */
4073 printf("resident_page_count=%d\n", object->resident_page_count);
4074
4075 iprintf("shadow=0x%x", object->shadow);
4076 if (object->shadow) {
4077 register int i = 0;
4078 vm_object_t shadow = object;
4079 while((shadow = shadow->shadow))
4080 i++;
4081 printf(" (depth %d)", i);
4082 }
4083 printf(", copy=0x%x", object->copy);
4084 printf(", shadow_offset=0x%x", object->shadow_offset);
4085 printf(", last_alloc=0x%x\n", object->last_alloc);
4086
4087 iprintf("pager=0x%x", object->pager);
4088 printf(", paging_offset=0x%x", object->paging_offset);
4089 printf(", pager_control=0x%x\n", object->pager_control);
4090
4091 iprintf("copy_strategy=%d[", object->copy_strategy);
4092 switch (object->copy_strategy) {
4093 case MEMORY_OBJECT_COPY_NONE:
4094 printf("copy_none");
4095 break;
4096
4097 case MEMORY_OBJECT_COPY_CALL:
4098 printf("copy_call");
4099 break;
4100
4101 case MEMORY_OBJECT_COPY_DELAY:
4102 printf("copy_delay");
4103 break;
4104
4105 case MEMORY_OBJECT_COPY_SYMMETRIC:
4106 printf("copy_symmetric");
4107 break;
4108
4109 case MEMORY_OBJECT_COPY_INVALID:
4110 printf("copy_invalid");
4111 break;
4112
4113 default:
4114 printf("?");
4115 }
4116 printf("]");
4117 printf(", absent_count=%d\n", object->absent_count);
4118
4119 iprintf("all_wanted=0x%x<", object->all_wanted);
4120 s = "";
4121 if (vm_object_wanted(object, VM_OBJECT_EVENT_INITIALIZED)) {
4122 printf("%sinit", s);
4123 s = ",";
4124 }
4125 if (vm_object_wanted(object, VM_OBJECT_EVENT_PAGER_READY)) {
4126 printf("%sready", s);
4127 s = ",";
4128 }
4129 if (vm_object_wanted(object, VM_OBJECT_EVENT_PAGING_IN_PROGRESS)) {
4130 printf("%spaging", s);
4131 s = ",";
4132 }
4133 if (vm_object_wanted(object, VM_OBJECT_EVENT_ABSENT_COUNT)) {
4134 printf("%sabsent", s);
4135 s = ",";
4136 }
4137 if (vm_object_wanted(object, VM_OBJECT_EVENT_LOCK_IN_PROGRESS)) {
4138 printf("%slock", s);
4139 s = ",";
4140 }
4141 if (vm_object_wanted(object, VM_OBJECT_EVENT_UNCACHING)) {
4142 printf("%suncaching", s);
4143 s = ",";
4144 }
4145 if (vm_object_wanted(object, VM_OBJECT_EVENT_COPY_CALL)) {
4146 printf("%scopy_call", s);
4147 s = ",";
4148 }
4149 if (vm_object_wanted(object, VM_OBJECT_EVENT_CACHING)) {
4150 printf("%scaching", s);
4151 s = ",";
4152 }
4153 printf(">");
4154 printf(", paging_in_progress=%d\n", object->paging_in_progress);
4155
4156 iprintf("%screated, %sinit, %sready, %spersist, %strusted, %spageout, %s, %s\n",
4157 (object->pager_created ? "" : "!"),
4158 (object->pager_initialized ? "" : "!"),
4159 (object->pager_ready ? "" : "!"),
4160 (object->can_persist ? "" : "!"),
4161 (object->pager_trusted ? "" : "!"),
4162 (object->pageout ? "" : "!"),
4163 (object->internal ? "internal" : "external"),
4164 (object->temporary ? "temporary" : "permanent"));
4165 iprintf("%salive, %spurgable, %spurgable_volatile, %spurgable_empty, %sshadowed, %scached, %sprivate\n",
4166 (object->alive ? "" : "!"),
4167 ((object->purgable != VM_OBJECT_NONPURGABLE) ? "" : "!"),
4168 ((object->purgable == VM_OBJECT_PURGABLE_VOLATILE) ? "" : "!"),
4169 ((object->purgable == VM_OBJECT_PURGABLE_EMPTY) ? "" : "!"),
4170 (object->shadowed ? "" : "!"),
4171 (vm_object_cached(object) ? "" : "!"),
4172 (object->private ? "" : "!"));
4173 iprintf("%sadvisory_pageout, %ssilent_overwrite\n",
4174 (object->advisory_pageout ? "" : "!"),
4175 (object->silent_overwrite ? "" : "!"));
4176
4177 #if MACH_PAGEMAP
4178 iprintf("existence_map=");
4179 vm_external_print(object->existence_map, object->size);
4180 #endif /* MACH_PAGEMAP */
4181 #if MACH_ASSERT
4182 iprintf("paging_object=0x%x\n", object->paging_object);
4183 #endif /* MACH_ASSERT */
4184
4185 if (vm_object_print_pages) {
4186 count = 0;
4187 p = (vm_page_t) queue_first(&object->memq);
4188 while (!queue_end(&object->memq, (queue_entry_t) p)) {
4189 if (count == 0) {
4190 iprintf("memory:=");
4191 } else if (count == 2) {
4192 printf("\n");
4193 iprintf(" ...");
4194 count = 0;
4195 } else {
4196 printf(",");
4197 }
4198 count++;
4199
4200 printf("(off=0x%llX,page=%p)", p->offset, p);
4201 p = (vm_page_t) queue_next(&p->listq);
4202 }
4203 if (count != 0) {
4204 printf("\n");
4205 }
4206 }
4207 db_indent -= 2;
4208 }
4209
4210
4211 /*
4212 * vm_object_find [ debug ]
4213 *
4214 * Find all tasks which reference the given vm_object.
4215 */
4216
4217 boolean_t vm_object_find(vm_object_t object);
4218 boolean_t vm_object_print_verbose = FALSE;
4219
4220 boolean_t
4221 vm_object_find(
4222 vm_object_t object)
4223 {
4224 task_t task;
4225 vm_map_t map;
4226 vm_map_entry_t entry;
4227 processor_set_t pset = &default_pset;
4228 boolean_t found = FALSE;
4229
4230 queue_iterate(&pset->tasks, task, task_t, pset_tasks) {
4231 map = task->map;
4232 for (entry = vm_map_first_entry(map);
4233 entry && entry != vm_map_to_entry(map);
4234 entry = entry->vme_next) {
4235
4236 vm_object_t obj;
4237
4238 /*
4239 * For the time being, skip submaps.
4240 * Only the kernel can have submaps,
4241 * so unless we are interested in
4242 * kernel objects we can simply skip
4243 * them. See sb/dejan/nmk18b7/src/mach_kernel/vm
4244 * for a full solution.
4245 */
4246 if (entry->is_sub_map)
4247 continue;
4248 if (entry)
4249 obj = entry->object.vm_object;
4250 else
4251 continue;
4252
4253 while (obj != VM_OBJECT_NULL) {
4254 if (obj == object) {
4255 if (!found) {
4256 printf("TASK\t\tMAP\t\tENTRY\n");
4257 found = TRUE;
4258 }
4259 printf("0x%x\t0x%x\t0x%x\n",
4260 task, map, entry);
4261 }
4262 obj = obj->shadow;
4263 }
4264 }
4265 }
4266
4267 return(found);
4268 }
4269
4270 #endif /* MACH_KDB */
4271
4272 kern_return_t
4273 vm_object_populate_with_private(
4274 vm_object_t object,
4275 vm_object_offset_t offset,
4276 ppnum_t phys_page,
4277 vm_size_t size)
4278 {
4279 ppnum_t base_page;
4280 vm_object_offset_t base_offset;
4281
4282
4283 if(!object->private)
4284 return KERN_FAILURE;
4285
4286 base_page = phys_page;
4287
4288 vm_object_lock(object);
4289 if(!object->phys_contiguous) {
4290 vm_page_t m;
4291 if((base_offset = trunc_page_64(offset)) != offset) {
4292 vm_object_unlock(object);
4293 return KERN_FAILURE;
4294 }
4295 base_offset += object->paging_offset;
4296 while(size) {
4297 m = vm_page_lookup(object, base_offset);
4298 if(m != VM_PAGE_NULL) {
4299 if(m->fictitious) {
4300 vm_page_lock_queues();
4301 m->fictitious = FALSE;
4302 m->private = TRUE;
4303 m->phys_page = base_page;
4304 if(!m->busy) {
4305 m->busy = TRUE;
4306 }
4307 if(!m->absent) {
4308 m->absent = TRUE;
4309 object->absent_count++;
4310 }
4311 m->list_req_pending = TRUE;
4312 vm_page_unlock_queues();
4313 } else if (m->phys_page != base_page) {
4314 /* pmap call to clear old mapping */
4315 pmap_disconnect(m->phys_page);
4316 m->phys_page = base_page;
4317 }
4318
4319 /*
4320 * ENCRYPTED SWAP:
4321 * We're not pointing to the same
4322 * physical page any longer and the
4323 * contents of the new one are not
4324 * supposed to be encrypted.
4325 * XXX What happens to the original
4326 * physical page? Is it lost?
4327 */
4328 m->encrypted = FALSE;
4329
4330 } else {
4331 while ((m = vm_page_grab_fictitious())
4332 == VM_PAGE_NULL)
4333 vm_page_more_fictitious();
4334 vm_page_lock_queues();
4335 m->fictitious = FALSE;
4336 m->private = TRUE;
4337 m->phys_page = base_page;
4338 m->list_req_pending = TRUE;
4339 m->absent = TRUE;
4340 m->unusual = TRUE;
4341 object->absent_count++;
4342 vm_page_unlock_queues();
4343 vm_page_insert(m, object, base_offset);
4344 }
4345 base_page++; /* Go to the next physical page */
4346 base_offset += PAGE_SIZE;
4347 size -= PAGE_SIZE;
4348 }
4349 } else {
4350 /* NOTE: we should check the original settings here */
4351 /* if we have a size > zero a pmap call should be made */
4352 /* to disable the range */
4353
4354 /* pmap_? */
4355
4356 /* shadows on contiguous memory are not allowed */
4357 /* we therefore can use the offset field */
4358 object->shadow_offset = (vm_object_offset_t)(phys_page << 12);
4359 object->size = size;
4360 }
4361 vm_object_unlock(object);
4362 return KERN_SUCCESS;
4363 }
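/*
 * Illustrative sketch (hypothetical): backing a private VM object with
 * a device's physical pages.  "dev_object" is assumed to have been
 * created with its "private" flag set, and "dev_ppnum" is the first
 * physical page number of the device range; the routine above rejects
 * non-private objects and unaligned offsets.
 *
 *	kr = vm_object_populate_with_private(dev_object,
 *					     (vm_object_offset_t) 0,
 *					     dev_ppnum,
 *					     round_page(dev_len));
 *	if (kr != KERN_SUCCESS) {
 *		// object was not private, or the offset was not page aligned
 *	}
 */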
4364
4365 /*
4366 * memory_object_free_from_cache:
4367 *
4368 * Walk the vm_object cache list, removing and freeing vm_objects
4369 * which are backed by the pager identified by the caller (pager_ops).
4370 * Remove up to "count" objects, if that many are available
4371 * in the cache.
4372 *
4373 * Walk the list at most once, return the number of vm_objects
4374 * actually freed.
4375 */
4376
4377 __private_extern__ kern_return_t
4378 memory_object_free_from_cache(
4379 __unused host_t host,
4380 memory_object_pager_ops_t pager_ops,
4381 int *count)
4382 {
4383
4384 int object_released = 0;
4385
4386 register vm_object_t object = VM_OBJECT_NULL;
4387 vm_object_t shadow;
4388
4389 /*
4390 if(host == HOST_NULL)
4391 return(KERN_INVALID_ARGUMENT);
4392 */
4393
4394 try_again:
4395 vm_object_cache_lock();
4396
4397 queue_iterate(&vm_object_cached_list, object,
4398 vm_object_t, cached_list) {
4399 if (object->pager &&
4400 (pager_ops == object->pager->mo_pager_ops)) {
4401 vm_object_lock(object);
4402 queue_remove(&vm_object_cached_list, object,
4403 vm_object_t, cached_list);
4404 vm_object_cached_count--;
4405
4406 /*
4407 * Since this object is in the cache, we know
4408 * that it is initialized and has only a pager's
4409 * (implicit) reference. Take a reference to avoid
4410 * recursive deallocations.
4411 */
4412
4413 assert(object->pager_initialized);
4414 assert(object->ref_count == 0);
4415 object->ref_count++;
4416
4417 /*
4418 * Terminate the object.
4419 * If the object had a shadow, we let
4420 * vm_object_deallocate deallocate it.
4421 * "pageout" objects have a shadow, but
4422 * maintain a "paging reference" rather
4423 * than a normal reference.
4424 * (We are careful here to limit recursion.)
4425 */
4426 shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
4427 if ((vm_object_terminate(object) == KERN_SUCCESS)
4428 && (shadow != VM_OBJECT_NULL)) {
4429 vm_object_deallocate(shadow);
4430 }
4431
4432 if(object_released++ == *count)
4433 return KERN_SUCCESS;
4434 goto try_again;
4435 }
4436 }
4437 vm_object_cache_unlock();
4438 *count = object_released;
4439 return KERN_SUCCESS;
4440 }
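/*
 * Illustrative sketch (hypothetical): releasing up to 32 cached objects
 * backed by a particular pager.  The host argument is unused by the
 * implementation above; "ops" stands for a hypothetical
 * memory_object_pager_ops_t describing the pager of interest.
 *
 *	int count = 32;
 *
 *	(void) memory_object_free_from_cache(HOST_NULL, ops, &count);
 *	// per the comment above, count reports the number of objects
 *	// actually freed on the full-walk path
 */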
4441
4442
4443
4444 kern_return_t
4445 memory_object_create_named(
4446 memory_object_t pager,
4447 memory_object_offset_t size,
4448 memory_object_control_t *control)
4449 {
4450 vm_object_t object;
4451 vm_object_hash_entry_t entry;
4452
4453 *control = MEMORY_OBJECT_CONTROL_NULL;
4454 if (pager == MEMORY_OBJECT_NULL)
4455 return KERN_INVALID_ARGUMENT;
4456
4457 vm_object_cache_lock();
4458 entry = vm_object_hash_lookup(pager, FALSE);
4459 if ((entry != VM_OBJECT_HASH_ENTRY_NULL) &&
4460 (entry->object != VM_OBJECT_NULL)) {
4461 if (entry->object->named == TRUE)
4462 panic("memory_object_create_named: caller already holds the right");
4463 }
4464 vm_object_cache_unlock();
4465 if ((object = vm_object_enter(pager, size, FALSE, FALSE, TRUE))
4466 == VM_OBJECT_NULL) {
4467 return(KERN_INVALID_OBJECT);
4468 }
4469
4470 /* wait for object (if any) to be ready */
4471 if (object != VM_OBJECT_NULL) {
4472 vm_object_lock(object);
4473 object->named = TRUE;
4474 while (!object->pager_ready) {
4475 vm_object_sleep(object,
4476 VM_OBJECT_EVENT_PAGER_READY,
4477 THREAD_UNINT);
4478 }
4479 *control = object->pager_control;
4480 vm_object_unlock(object);
4481 }
4482 return (KERN_SUCCESS);
4483 }
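/*
 * Illustrative sketch (hypothetical): a pager claiming the "named"
 * right on its memory object at setup time.  "pager" and "object_size"
 * are hypothetical; on success the returned control can later be used
 * with memory_object_recover_named() below.
 *
 *	memory_object_control_t control;
 *
 *	kr = memory_object_create_named(pager, object_size, &control);
 *	if (kr != KERN_SUCCESS)
 *		return kr;	// bad pager, or vm_object_enter() failed
 */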
4484
4485
4486 /*
4487 * Routine: memory_object_recover_named [user interface]
4488 * Purpose:
4489 * Attempt to recover a named reference for a VM object.
4490 * VM will verify that the object has not already started
4491 * down the termination path, and if it has, will optionally
4492 * wait for that to finish.
4493 * Returns:
4494 * KERN_SUCCESS - we recovered a named reference on the object
4495 * KERN_FAILURE - we could not recover a reference (object dead)
4496 * KERN_INVALID_ARGUMENT - bad memory object control
4497 */
4498 kern_return_t
4499 memory_object_recover_named(
4500 memory_object_control_t control,
4501 boolean_t wait_on_terminating)
4502 {
4503 vm_object_t object;
4504
4505 vm_object_cache_lock();
4506 object = memory_object_control_to_vm_object(control);
4507 if (object == VM_OBJECT_NULL) {
4508 vm_object_cache_unlock();
4509 return (KERN_INVALID_ARGUMENT);
4510 }
4511
4512 restart:
4513 vm_object_lock(object);
4514
4515 if (object->terminating && wait_on_terminating) {
4516 vm_object_cache_unlock();
4517 vm_object_wait(object,
4518 VM_OBJECT_EVENT_PAGING_IN_PROGRESS,
4519 THREAD_UNINT);
4520 vm_object_cache_lock();
4521 goto restart;
4522 }
4523
4524 if (!object->alive) {
4525 vm_object_cache_unlock();
4526 vm_object_unlock(object);
4527 return KERN_FAILURE;
4528 }
4529
4530 if (object->named == TRUE) {
4531 vm_object_cache_unlock();
4532 vm_object_unlock(object);
4533 return KERN_SUCCESS;
4534 }
4535
4536 if((object->ref_count == 0) && (!object->terminating)){
4537 queue_remove(&vm_object_cached_list, object,
4538 vm_object_t, cached_list);
4539 vm_object_cached_count--;
4540 XPR(XPR_VM_OBJECT_CACHE,
4541 "memory_object_recover_named: removing %X, head (%X, %X)\n",
4542 (integer_t)object,
4543 (integer_t)vm_object_cached_list.next,
4544 (integer_t)vm_object_cached_list.prev, 0,0);
4545 }
4546
4547 vm_object_cache_unlock();
4548
4549 object->named = TRUE;
4550 object->ref_count++;
4551 vm_object_res_reference(object);
4552 while (!object->pager_ready) {
4553 vm_object_sleep(object,
4554 VM_OBJECT_EVENT_PAGER_READY,
4555 THREAD_UNINT);
4556 }
4557 vm_object_unlock(object);
4558 return (KERN_SUCCESS);
4559 }
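/*
 * Illustrative sketch (hypothetical): re-acquiring the named reference
 * for a control port, waiting out any termination in progress.
 *
 *	kr = memory_object_recover_named(control, TRUE);
 *	switch (kr) {
 *	case KERN_SUCCESS:		// named reference recovered
 *		break;
 *	case KERN_FAILURE:		// object already dead
 *		break;
 *	case KERN_INVALID_ARGUMENT:	// bad memory object control
 *		break;
 *	}
 */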
4560
4561
4562 /*
4563 * vm_object_release_name:
4564 *
4565 * Enforces name semantic on memory_object reference count decrement
4566 * This routine should not be called unless the caller holds a name
4567 * reference gained through the memory_object_create_named.
4568 *
4569 * If the TERMINATE_IDLE flag is set, the call will return if the
4570 * reference count is not 1, i.e. if the object is not idle with the only
4571 * remaining reference being the name.
4572 * If the decision is made to proceed, the named flag is set to
4573 * FALSE and the reference count is decremented. If the RESPECT_CACHE
4574 * flag is set and the reference count has gone to zero, the
4575 * memory_object is checked to see if it is cacheable; otherwise, when
4576 * the reference count is zero, it is simply terminated.
4577 */
4578
4579 __private_extern__ kern_return_t
4580 vm_object_release_name(
4581 vm_object_t object,
4582 int flags)
4583 {
4584 vm_object_t shadow;
4585 boolean_t original_object = TRUE;
4586
4587 while (object != VM_OBJECT_NULL) {
4588
4589 /*
4590 * The cache holds a reference (uncounted) to
4591 * the object. We must lock it before removing
4592 * the object.
4593 *
4594 */
4595
4596 vm_object_cache_lock();
4597 vm_object_lock(object);
4598 assert(object->alive);
4599 if(original_object)
4600 assert(object->named);
4601 assert(object->ref_count > 0);
4602
4603 /*
4604 * We have to wait for initialization before
4605 * destroying or caching the object.
4606 */
4607
4608 if (object->pager_created && !object->pager_initialized) {
4609 assert(!object->can_persist);
4610 vm_object_assert_wait(object,
4611 VM_OBJECT_EVENT_INITIALIZED,
4612 THREAD_UNINT);
4613 vm_object_unlock(object);
4614 vm_object_cache_unlock();
4615 thread_block(THREAD_CONTINUE_NULL);
4616 continue;
4617 }
4618
4619 if (((object->ref_count > 1)
4620 && (flags & MEMORY_OBJECT_TERMINATE_IDLE))
4621 || (object->terminating)) {
4622 vm_object_unlock(object);
4623 vm_object_cache_unlock();
4624 return KERN_FAILURE;
4625 } else {
4626 if (flags & MEMORY_OBJECT_RELEASE_NO_OP) {
4627 vm_object_unlock(object);
4628 vm_object_cache_unlock();
4629 return KERN_SUCCESS;
4630 }
4631 }
4632
4633 if ((flags & MEMORY_OBJECT_RESPECT_CACHE) &&
4634 (object->ref_count == 1)) {
4635 if(original_object)
4636 object->named = FALSE;
4637 vm_object_unlock(object);
4638 vm_object_cache_unlock();
4639 /* let vm_object_deallocate push this thing into */
4640 /* the cache, if that is where it is bound */
4641 vm_object_deallocate(object);
4642 return KERN_SUCCESS;
4643 }
4644 VM_OBJ_RES_DECR(object);
4645 shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
4646 if(object->ref_count == 1) {
4647 if(vm_object_terminate(object) != KERN_SUCCESS) {
4648 if(original_object) {
4649 return KERN_FAILURE;
4650 } else {
4651 return KERN_SUCCESS;
4652 }
4653 }
4654 if (shadow != VM_OBJECT_NULL) {
4655 original_object = FALSE;
4656 object = shadow;
4657 continue;
4658 }
4659 return KERN_SUCCESS;
4660 } else {
4661 object->ref_count--;
4662 assert(object->ref_count > 0);
4663 if(original_object)
4664 object->named = FALSE;
4665 vm_object_unlock(object);
4666 vm_object_cache_unlock();
4667 return KERN_SUCCESS;
4668 }
4669 }
4670 /*NOTREACHED*/
4671 assert(0);
4672 return KERN_FAILURE;
4673 }
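/*
 * Illustrative sketch (hypothetical): dropping a named reference only
 * if the object is otherwise idle, and letting the cache keep it if it
 * is cacheable.  The flag names are the ones tested above.
 *
 *	kr = vm_object_release_name(object,
 *				    MEMORY_OBJECT_TERMINATE_IDLE |
 *				    MEMORY_OBJECT_RESPECT_CACHE);
 *	if (kr == KERN_FAILURE) {
 *		// object still has other references (or is terminating);
 *		// the name reference was not released
 *	}
 */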
4674
4675
4676 __private_extern__ kern_return_t
4677 vm_object_lock_request(
4678 vm_object_t object,
4679 vm_object_offset_t offset,
4680 vm_object_size_t size,
4681 memory_object_return_t should_return,
4682 int flags,
4683 vm_prot_t prot)
4684 {
4685 __unused boolean_t should_flush;
4686
4687 should_flush = flags & MEMORY_OBJECT_DATA_FLUSH;
4688
4689 XPR(XPR_MEMORY_OBJECT,
4690 "vm_o_lock_request, obj 0x%X off 0x%X size 0x%X flags %X prot %X\n",
4691 (integer_t)object, offset, size,
4692 (((should_return&1)<<1)|should_flush), prot);
4693
4694 /*
4695 * Check for bogus arguments.
4696 */
4697 if (object == VM_OBJECT_NULL)
4698 return (KERN_INVALID_ARGUMENT);
4699
4700 if ((prot & ~VM_PROT_ALL) != 0 && prot != VM_PROT_NO_CHANGE)
4701 return (KERN_INVALID_ARGUMENT);
4702
4703 size = round_page_64(size);
4704
4705 /*
4706 * Lock the object, and acquire a paging reference to
4707 * prevent the memory_object reference from being released.
4708 */
4709 vm_object_lock(object);
4710 vm_object_paging_begin(object);
4711
4712 (void)vm_object_update(object,
4713 offset, size, NULL, NULL, should_return, flags, prot);
4714
4715 vm_object_paging_end(object);
4716 vm_object_unlock(object);
4717
4718 return (KERN_SUCCESS);
4719 }
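/*
 * Illustrative sketch (hypothetical): returning a range of the object
 * to its pager and flushing it from memory.  MEMORY_OBJECT_DATA_FLUSH
 * is the flag tested above; MEMORY_OBJECT_RETURN_ALL is assumed from
 * the standard memory_object_return_t values, and VM_PROT_NO_CHANGE
 * leaves the protection untouched.
 *
 *	kr = vm_object_lock_request(object, offset, size,
 *				    MEMORY_OBJECT_RETURN_ALL,
 *				    MEMORY_OBJECT_DATA_FLUSH,
 *				    VM_PROT_NO_CHANGE);
 */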
4720
4721 /*
4722 * Empty a purgable object by grabbing the physical pages assigned to it and
4723 * putting them on the free queue without writing them to backing store, etc.
4724 * When the pages are next touched they will be demand zero-fill pages. We
4725 * skip pages which are busy, being paged in/out, wired, etc. We do _not_
4726 * skip referenced/dirty pages, pages on the active queue, etc. We're more
4727 * than happy to grab these since this is a purgable object. We mark the
4728 * object as "empty" after reaping its pages.
4729 *
4730 * On entry the object and page queues are locked, the object must be a
4731 * purgable object with no delayed copies pending.
4732 */
4733 unsigned int
4734 vm_object_purge(vm_object_t object)
4735 {
4736 vm_page_t p, next;
4737 unsigned int num_purged_pages;
4738 vm_page_t local_freeq;
4739 unsigned long local_freed;
4740 int purge_loop_quota;
4741 /* free pages as soon as we gather PURGE_BATCH_FREE_LIMIT pages to free */
4742 #define PURGE_BATCH_FREE_LIMIT 50
4743 /* release page queues lock every PURGE_LOOP_QUOTA iterations */
4744 #define PURGE_LOOP_QUOTA 100
4745
4746 num_purged_pages = 0;
4747 if (object->purgable == VM_OBJECT_NONPURGABLE)
4748 return num_purged_pages;
4749
4750 object->purgable = VM_OBJECT_PURGABLE_EMPTY;
4751
4752 assert(object->copy == VM_OBJECT_NULL);
4753 assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE);
4754 purge_loop_quota = PURGE_LOOP_QUOTA;
4755
4756 local_freeq = VM_PAGE_NULL;
4757 local_freed = 0;
4758
4759 /*
4760 * Go through the object's resident pages and try and discard them.
4761 */
4762 next = (vm_page_t)queue_first(&object->memq);
4763 while (!queue_end(&object->memq, (queue_entry_t)next)) {
4764 p = next;
4765 next = (vm_page_t)queue_next(&next->listq);
4766
4767 if (purge_loop_quota-- == 0) {
4768 /*
4769 * Avoid holding the page queues lock for too long.
4770 * Let someone else take it for a while if needed.
4771 * Keep holding the object's lock to guarantee that
4772 * the object's page list doesn't change under us
4773 * while we yield.
4774 */
4775 if (local_freeq != VM_PAGE_NULL) {
4776 /*
4777 * Flush our queue of pages to free.
4778 */
4779 vm_page_free_list(local_freeq);
4780 local_freeq = VM_PAGE_NULL;
4781 local_freed = 0;
4782 }
4783 vm_page_unlock_queues();
4784 mutex_pause();
4785 vm_page_lock_queues();
4786
4787 /* resume with the current page and a new quota */
4788 purge_loop_quota = PURGE_LOOP_QUOTA;
4789 }
4790
4791
4792 if (p->busy || p->cleaning || p->laundry ||
4793 p->list_req_pending) {
4794 /* page is being acted upon, so don't mess with it */
4795 continue;
4796 }
4797 if (p->wire_count) {
4798 /* don't discard a wired page */
4799 continue;
4800 }
4801
4802 if (p->tabled) {
4803 /* clean up the object/offset table */
4804 vm_page_remove(p);
4805 }
4806 if (p->absent) {
4807 /* update the object's count of absent pages */
4808 vm_object_absent_release(object);
4809 }
4810
4811 /* we can discard this page */
4812
4813 /* advertise that this page is in a transition state */
4814 p->busy = TRUE;
4815
4816 if (p->no_isync == TRUE) {
4817 /* the page hasn't been mapped yet */
4818 /* (optimization to delay the i-cache sync) */
4819 } else {
4820 /* unmap the page */
4821 int refmod_state;
4822
4823 refmod_state = pmap_disconnect(p->phys_page);
4824 if (refmod_state & VM_MEM_MODIFIED) {
4825 p->dirty = TRUE;
4826 }
4827 }
4828
4829 if (p->dirty || p->precious) {
4830 /* we saved the cost of cleaning this page! */
4831 num_purged_pages++;
4832 vm_page_purged_count++;
4833 }
4834
4835 /* remove page from active or inactive queue... */
4836 VM_PAGE_QUEUES_REMOVE(p);
4837
4838 /* ... and put it on our queue of pages to free */
4839 assert(!p->laundry);
4840 assert(p->object != kernel_object);
4841 assert(p->pageq.next == NULL &&
4842 p->pageq.prev == NULL);
4843 p->pageq.next = (queue_entry_t) local_freeq;
4844 local_freeq = p;
4845 if (++local_freed >= PURGE_BATCH_FREE_LIMIT) {
4846 /* flush our queue of pages to free */
4847 vm_page_free_list(local_freeq);
4848 local_freeq = VM_PAGE_NULL;
4849 local_freed = 0;
4850 }
4851 }
4852
4853 /* flush our local queue of pages to free one last time */
4854 if (local_freeq != VM_PAGE_NULL) {
4855 vm_page_free_list(local_freeq);
4856 local_freeq = VM_PAGE_NULL;
4857 local_freed = 0;
4858 }
4859
4860 return num_purged_pages;
4861 }
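/*
 * Illustrative sketch (hypothetical): emptying a volatile purgable
 * object directly.  Per the comment above, both the object lock and
 * the page queues lock must be held on entry, and the object must be
 * a purgable object with no delayed copies pending.
 *
 *	vm_object_lock(object);
 *	vm_page_lock_queues();
 *	purged = vm_object_purge(object);
 *	vm_page_unlock_queues();
 *	vm_object_unlock(object);
 *	// "purged" counts the dirty/precious pages whose cleaning we avoided
 */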
4862
4863 /*
4864 * vm_object_purgable_control() allows the caller to control and investigate the
4865 * state of a purgable object. A purgable object is created via a call to
4866 * vm_allocate() with VM_FLAGS_PURGABLE specified. A purgable object will
4867 * never be coalesced with any other object -- even other purgable objects --
4868 * and will thus always remain a distinct object. A purgable object has
4869 * special semantics when its reference count is exactly 1. If its reference
4870 * count is greater than 1, then a purgable object will behave like a normal
4871 * object and attempts to use this interface will result in an error return
4872 * of KERN_INVALID_ARGUMENT.
4873 *
4874 * A purgable object may be put into a "volatile" state which will make the
4875 * object's pages eligible for being reclaimed without paging to backing
4876 * store if the system runs low on memory. If the pages in a volatile
4877 * purgable object are reclaimed, the purgable object is said to have been
4878 * "emptied." When a purgable object is emptied the system will reclaim as
4879 * many pages from the object as it can in a convenient manner (pages already
4880 * en route to backing store or busy for other reasons are left as is). When
4881 * a purgable object is made volatile, its pages will generally be reclaimed
4882 * before other pages in the application's working set. This semantic is
4883 * generally used by applications which can recreate the data in the object
4884 * faster than it can be paged in. One such example might be media assets
4885 * which can be reread from a much faster RAID volume.
4886 *
4887 * A purgable object may be designated as "non-volatile" which means it will
4888 * behave like all other objects in the system with pages being written to and
4889 * read from backing store as needed to satisfy system memory needs. If the
4890 * object was emptied before the object was made non-volatile, that fact will
4891 * be returned as the old state of the purgable object (see
4892 * VM_PURGABLE_SET_STATE below). In this case, any pages of the object which
4893 * were reclaimed as part of emptying the object will be refaulted in as
4894 * zero-fill on demand. It is up to the application to note that an object
4895 * was emptied and recreate the object's contents if necessary. When a
4896 * purgable object is made non-volatile, its pages will generally not be paged
4897 * out to backing store in the immediate future. A purgable object may also
4898 * be manually emptied.
4899 *
4900 * Finally, the current state (non-volatile, volatile, volatile & empty) of a
4901 * purgable object may be queried at any time. This information may
4902 * be used as a control input to let the application know when the system is
4903 * experiencing memory pressure and is reclaiming memory.
4904 *
4905 * The specified address may be any address within the purgable object. If
4906 * the specified address does not represent any object in the target task's
4907 * virtual address space, then KERN_INVALID_ADDRESS will be returned. If the
4908 * object containing the specified address is not a purgable object, then
4909 * KERN_INVALID_ARGUMENT will be returned. Otherwise, KERN_SUCCESS will be
4910 * returned.
4911 *
4912 * The control parameter may be any one of VM_PURGABLE_SET_STATE or
4913 * VM_PURGABLE_GET_STATE. For VM_PURGABLE_SET_STATE, the in/out parameter
4914 * state is used to set the new state of the purgable object and return its
4915 * old state. For VM_PURGABLE_GET_STATE, the current state of the purgable
4916 * object is returned in the parameter state.
4917 *
4918 * The in/out parameter state may be one of VM_PURGABLE_NONVOLATILE,
4919 * VM_PURGABLE_VOLATILE or VM_PURGABLE_EMPTY. These, respectively, represent
4920 * the non-volatile, volatile and volatile/empty states described above.
4921 * Setting the state of a purgable object to VM_PURGABLE_EMPTY will
4922 * immediately reclaim as many pages in the object as can be conveniently
4923 * collected (some may have already been written to backing store or be
4924 * otherwise busy).
4925 *
4926 * The process of making a purgable object non-volatile and determining its
4927 * previous state is atomic. Thus, if a purgable object is made
4928 * VM_PURGABLE_NONVOLATILE and the old state is returned as
4929 * VM_PURGABLE_VOLATILE, then the purgable object's previous contents are
4930 * completely intact and will remain so until the object is made volatile
4931 * again. If the old state is returned as VM_PURGABLE_EMPTY then the object
4932 * was reclaimed while it was in a volatile state and its previous contents
4933 * have been lost.
4934 */
4935 /*
4936 * The object must be locked.
4937 */
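/*
 * Illustrative sketch (hypothetical caller, e.g. the layer servicing a
 * user's vm_purgable_control() request): making a purgable object
 * non-volatile and learning its previous state.  The object lock is
 * held across the call, as required above.
 *
 *	int state = VM_PURGABLE_NONVOLATILE;
 *
 *	vm_object_lock(object);
 *	kr = vm_object_purgable_control(object, VM_PURGABLE_SET_STATE, &state);
 *	vm_object_unlock(object);
 *
 *	if (kr == KERN_SUCCESS && state == VM_PURGABLE_EMPTY) {
 *		// the object was emptied while volatile; its previous
 *		// contents are lost and must be recreated
 *	}
 */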
4938 kern_return_t
4939 vm_object_purgable_control(
4940 vm_object_t object,
4941 vm_purgable_t control,
4942 int *state)
4943 {
4944 int old_state;
4945 vm_page_t p;
4946
4947 if (object == VM_OBJECT_NULL) {
4948 /*
4949 * Object must already be present or it can't be purgable.
4950 */
4951 return KERN_INVALID_ARGUMENT;
4952 }
4953
4954 /*
4955 * Get current state of the purgable object.
4956 */
4957 switch (object->purgable) {
4958 case VM_OBJECT_NONPURGABLE:
4959 return KERN_INVALID_ARGUMENT;
4960
4961 case VM_OBJECT_PURGABLE_NONVOLATILE:
4962 old_state = VM_PURGABLE_NONVOLATILE;
4963 break;
4964
4965 case VM_OBJECT_PURGABLE_VOLATILE:
4966 old_state = VM_PURGABLE_VOLATILE;
4967 break;
4968
4969 case VM_OBJECT_PURGABLE_EMPTY:
4970 old_state = VM_PURGABLE_EMPTY;
4971 break;
4972
4973 default:
4974 old_state = VM_PURGABLE_NONVOLATILE;
4975 panic("Bad state (%d) for purgable object!\n",
4976 object->purgable);
4977 /*NOTREACHED*/
4978 }
4979
4980 /* purgable can't have delayed copies - now or in the future */
4981 assert(object->copy == VM_OBJECT_NULL);
4982 assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE);
4983
4984 /*
4985 * Execute the desired operation.
4986 */
4987 if (control == VM_PURGABLE_GET_STATE) {
4988 *state = old_state;
4989 return KERN_SUCCESS;
4990 }
4991
4992 switch (*state) {
4993 case VM_PURGABLE_NONVOLATILE:
4994 vm_page_lock_queues();
4995 if (object->purgable != VM_OBJECT_PURGABLE_NONVOLATILE) {
4996 assert(vm_page_purgeable_count >=
4997 object->resident_page_count);
4998 vm_page_purgeable_count -= object->resident_page_count;
4999 }
5000
5001 object->purgable = VM_OBJECT_PURGABLE_NONVOLATILE;
5002
5003 /*
5004 * If the object wasn't emptied, then mark all pages of the
5005 * object as referenced in order to give them a complete turn
5006 * of the virtual memory "clock" before becoming candidates
5007 * for paging out (if the system is suffering from memory
5008 * pressure). We don't really need to set the pmap reference
5009 * bits (which would be expensive) since the software copies
5010 * are trusted whenever they are set to TRUE.
5011 */
5012 if (old_state != VM_PURGABLE_EMPTY) {
5013 for (p = (vm_page_t)queue_first(&object->memq);
5014 !queue_end(&object->memq, (queue_entry_t)p);
5015 p = (vm_page_t)queue_next(&p->listq))
5016 p->reference = TRUE;
5017 }
5018
5019 vm_page_unlock_queues();
5020
5021 break;
5022
5023 case VM_PURGABLE_VOLATILE:
5024 vm_page_lock_queues();
5025
5026 if (object->purgable != VM_OBJECT_PURGABLE_VOLATILE &&
5027 object->purgable != VM_OBJECT_PURGABLE_EMPTY) {
5028 vm_page_purgeable_count += object->resident_page_count;
5029 }
5030
5031 object->purgable = VM_OBJECT_PURGABLE_VOLATILE;
5032
5033 /*
5034 * We want the newly volatile purgable object to be a
5035 * candidate for the pageout scan before other pages in the
5036 * application if the system is suffering from memory
5037 * pressure. To do this, we move a page of the object from
5038 * the active queue onto the inactive queue in order to
5039 * promote the object for early reclaim. We only need to move
5040 * a single page since the pageout scan will reap the entire
5041 * purgable object if it finds a single page in a volatile
5042 * state. Obviously we don't do this if there are no pages
5043 * associated with the object or we find a page of the object
5044 * already on the inactive queue.
5045 */
5046 for (p = (vm_page_t)queue_first(&object->memq);
5047 !queue_end(&object->memq, (queue_entry_t)p);
5048 p = (vm_page_t)queue_next(&p->listq)) {
5049 if (p->inactive) {
5050 /* already a page on the inactive queue */
5051 break;
5052 }
5053 if (p->active && !p->busy) {
5054 /* found one we can move */
5055 vm_page_deactivate(p);
5056 break;
5057 }
5058 }
5059 vm_page_unlock_queues();
5060
5061 break;
5062
5063
5064 case VM_PURGABLE_EMPTY:
5065 vm_page_lock_queues();
5066 if (object->purgable != VM_OBJECT_PURGABLE_VOLATILE &&
5067 object->purgable != VM_OBJECT_PURGABLE_EMPTY) {
5068 vm_page_purgeable_count += object->resident_page_count;
5069 }
5070 (void) vm_object_purge(object);
5071 vm_page_unlock_queues();
5072 break;
5073
5074 }
5075 *state = old_state;
5076
5077 return KERN_SUCCESS;
5078 }
5079
5080 #if TASK_SWAPPER
5081 /*
5082 * vm_object_res_deallocate
5083 *
5084 * (recursively) decrement residence counts on vm objects and their shadows.
5085 * Called from vm_object_deallocate and when swapping out an object.
5086 *
5087 * The object is locked, and remains locked throughout the function,
5088 * even as we iterate down the shadow chain. Locks on intermediate objects
5089 * will be dropped, but not the original object.
5090 *
5091 * NOTE: this function used to use recursion, rather than iteration.
5092 */
5093
5094 __private_extern__ void
5095 vm_object_res_deallocate(
5096 vm_object_t object)
5097 {
5098 vm_object_t orig_object = object;
5099 /*
5100 * Object is locked so it can be called directly
5101 * from vm_object_deallocate. Original object is never
5102 * unlocked.
5103 */
5104 assert(object->res_count > 0);
5105 while (--object->res_count == 0) {
5106 assert(object->ref_count >= object->res_count);
5107 vm_object_deactivate_all_pages(object);
5108 /* iterate on shadow, if present */
5109 if (object->shadow != VM_OBJECT_NULL) {
5110 vm_object_t tmp_object = object->shadow;
5111 vm_object_lock(tmp_object);
5112 if (object != orig_object)
5113 vm_object_unlock(object);
5114 object = tmp_object;
5115 assert(object->res_count > 0);
5116 } else
5117 break;
5118 }
5119 if (object != orig_object)
5120 vm_object_unlock(object);
5121 }
5122
5123 /*
5124 * vm_object_res_reference
5125 *
5126 * Internal function to increment residence count on a vm object
5127 * and its shadows. It is called only from vm_object_reference, and
5128 * when swapping in a vm object, via vm_map_swap.
5129 *
5130 * The object is locked, and remains locked throughout the function,
5131 * even as we iterate down the shadow chain. Locks on intermediate objects
5132 * will be dropped, but not the original object.
5133 *
5134 * NOTE: this function used to use recursion, rather than iteration.
5135 */
5136
5137 __private_extern__ void
5138 vm_object_res_reference(
5139 vm_object_t object)
5140 {
5141 vm_object_t orig_object = object;
5142 /*
5143 * Object is locked, so this can be called directly
5144 * from vm_object_reference. This lock is never released.
5145 */
5146 while ((++object->res_count == 1) &&
5147 (object->shadow != VM_OBJECT_NULL)) {
5148 vm_object_t tmp_object = object->shadow;
5149
5150 assert(object->ref_count >= object->res_count);
5151 vm_object_lock(tmp_object);
5152 if (object != orig_object)
5153 vm_object_unlock(object);
5154 object = tmp_object;
5155 }
5156 if (object != orig_object)
5157 vm_object_unlock(object);
5158 assert(orig_object->ref_count >= orig_object->res_count);
5159 }
5160 #endif /* TASK_SWAPPER */
5161
5162 /*
5163 * vm_object_reference:
5164 *
5165 * Gets another reference to the given object.
5166 */
5167 #ifdef vm_object_reference
5168 #undef vm_object_reference
5169 #endif
5170 __private_extern__ void
5171 vm_object_reference(
5172 register vm_object_t object)
5173 {
5174 if (object == VM_OBJECT_NULL)
5175 return;
5176
5177 vm_object_lock(object);
5178 assert(object->ref_count > 0);
5179 vm_object_reference_locked(object);
5180 vm_object_unlock(object);
5181 }
5182
5183 #ifdef MACH_BSD
5184 /*
5185 * Scale the vm_object_cache
5186 * This is required to make sure that the vm_object_cache is big
5187 * enough to effectively cache the mapped file.
5188 * This is really important with UBC as all the regular file vnodes
5189 * have a memory object associated with them. Having this cache too
5190 * small results in rapid reclaim of vnodes and hurts performance a LOT!
5191 *
5192 * This is also needed as number of vnodes can be dynamically scaled.
5193 */
5194 kern_return_t
5195 adjust_vm_object_cache(
5196 __unused vm_size_t oval,
5197 vm_size_t nval)
5198 {
5199 vm_object_cached_max = nval;
5200 vm_object_cache_trim(FALSE);
5201 return (KERN_SUCCESS);
5202 }
5203 #endif /* MACH_BSD */
5204
5205
5206 /*
5207 * vm_object_transpose
5208 *
5209 * This routine takes two VM objects of the same size and exchanges
5210 * their backing store.
5211 * The objects should be "quiesced" via a UPL operation with UPL_SET_IO_WIRE
5212 * and UPL_BLOCK_ACCESS if they are referenced anywhere.
5213 *
5214 * The VM objects must not be locked by caller.
5215 */
5216 kern_return_t
5217 vm_object_transpose(
5218 vm_object_t object1,
5219 vm_object_t object2,
5220 vm_object_size_t transpose_size)
5221 {
5222 vm_object_t tmp_object;
5223 kern_return_t retval;
5224 boolean_t object1_locked, object2_locked;
5225 boolean_t object1_paging, object2_paging;
5226 vm_page_t page;
5227 vm_object_offset_t page_offset;
5228
5229 tmp_object = VM_OBJECT_NULL;
5230 object1_locked = FALSE; object2_locked = FALSE;
5231 object1_paging = FALSE; object2_paging = FALSE;
5232
5233 if (object1 == object2 ||
5234 object1 == VM_OBJECT_NULL ||
5235 object2 == VM_OBJECT_NULL) {
5236 /*
5237 * If the 2 VM objects are the same, there's
5238 * no point in exchanging their backing store.
5239 */
5240 retval = KERN_INVALID_VALUE;
5241 goto done;
5242 }
5243
5244 vm_object_lock(object1);
5245 object1_locked = TRUE;
5246 if (object1->copy || object1->shadow || object1->shadowed ||
5247 object1->purgable != VM_OBJECT_NONPURGABLE) {
5248 /*
5249 * We don't deal with copy or shadow objects (yet).
5250 */
5251 retval = KERN_INVALID_VALUE;
5252 goto done;
5253 }
5254 /*
5255 * Since we're about to mess with the object's backing store,
5256 * mark it as "paging_in_progress". Note that this is not enough
5257 * to prevent any paging activity on this object, so the caller should
5258 * have "quiesced" the objects beforehand, via a UPL operation with
5259 * UPL_SET_IO_WIRE (to make sure all the pages are there and wired)
5260 * and UPL_BLOCK_ACCESS (to mark the pages "busy").
5261 */
5262 vm_object_paging_begin(object1);
5263 object1_paging = TRUE;
5264 vm_object_unlock(object1);
5265 object1_locked = FALSE;
5266
5267 /*
5268 * Same as above for the 2nd object...
5269 */
5270 vm_object_lock(object2);
5271 object2_locked = TRUE;
5272 if (object2->copy || object2->shadow || object2->shadowed ||
5273 object2->purgable != VM_OBJECT_NONPURGABLE) {
5274 retval = KERN_INVALID_VALUE;
5275 goto done;
5276 }
5277 vm_object_paging_begin(object2);
5278 object2_paging = TRUE;
5279 vm_object_unlock(object2);
5280 object2_locked = FALSE;
5281
5282 /*
5283 * Allocate a temporary VM object to hold object1's contents
5284 * while we copy object2 to object1.
5285 */
5286 tmp_object = vm_object_allocate(transpose_size);
5287 vm_object_lock(tmp_object);
5288 vm_object_paging_begin(tmp_object);
5289 tmp_object->can_persist = FALSE;
5290
5291 /*
5292 * Since we need to lock both objects at the same time,
5293 * make sure we always lock them in the same order to
5294 * avoid deadlocks.
5295 */
5296 if (object1 < object2) {
5297 vm_object_lock(object1);
5298 vm_object_lock(object2);
5299 } else {
5300 vm_object_lock(object2);
5301 vm_object_lock(object1);
5302 }
5303 object1_locked = TRUE;
5304 object2_locked = TRUE;
5305
5306 if (object1->size != object2->size ||
5307 object1->size != transpose_size) {
5308 /*
5309 * If the 2 objects don't have the same size, we can't
5310 * exchange their backing stores or one would overflow.
5311 * If their size doesn't match the caller's
5312 * "transpose_size", we can't do it either because the
5313 * transpose operation will affect the entire span of
5314 * the objects.
5315 */
5316 retval = KERN_INVALID_VALUE;
5317 goto done;
5318 }
5319
5320
5321 /*
5322 * Transpose the lists of resident pages.
5323 */
5324 if (object1->phys_contiguous || queue_empty(&object1->memq)) {
5325 /*
5326 * No pages in object1, just transfer pages
5327 * from object2 to object1. No need to go through
5328 * an intermediate object.
5329 */
5330 while (!queue_empty(&object2->memq)) {
5331 page = (vm_page_t) queue_first(&object2->memq);
5332 vm_page_rename(page, object1, page->offset);
5333 }
5334 assert(queue_empty(&object2->memq));
5335 } else if (object2->phys_contiguous || queue_empty(&object2->memq)) {
5336 /*
5337 * No pages in object2, just transfer pages
5338 * from object1 to object2. No need to go through
5339 * an intermediate object.
5340 */
5341 while (!queue_empty(&object1->memq)) {
5342 page = (vm_page_t) queue_first(&object1->memq);
5343 vm_page_rename(page, object2, page->offset);
5344 }
5345 assert(queue_empty(&object1->memq));
5346 } else {
5347 /* transfer object1's pages to tmp_object */
5348 vm_page_lock_queues();
5349 while (!queue_empty(&object1->memq)) {
5350 page = (vm_page_t) queue_first(&object1->memq);
5351 page_offset = page->offset;
5352 vm_page_remove(page);
5353 page->offset = page_offset;
5354 queue_enter(&tmp_object->memq, page, vm_page_t, listq);
5355 }
5356 vm_page_unlock_queues();
5357 assert(queue_empty(&object1->memq));
5358 /* transfer object2's pages to object1 */
5359 while (!queue_empty(&object2->memq)) {
5360 page = (vm_page_t) queue_first(&object2->memq);
5361 vm_page_rename(page, object1, page->offset);
5362 }
5363 assert(queue_empty(&object2->memq));
5364 /* transfer tmp_object's pages to object1 */
5365 while (!queue_empty(&tmp_object->memq)) {
5366 page = (vm_page_t) queue_first(&tmp_object->memq);
5367 queue_remove(&tmp_object->memq, page,
5368 vm_page_t, listq);
5369 vm_page_insert(page, object2, page->offset);
5370 }
5371 assert(queue_empty(&tmp_object->memq));
5372 }
5373
5374 /* no need to transpose the size: they should be identical */
5375 assert(object1->size == object2->size);
5376
5377 #define __TRANSPOSE_FIELD(field) \
5378 MACRO_BEGIN \
5379 tmp_object->field = object1->field; \
5380 object1->field = object2->field; \
5381 object2->field = tmp_object->field; \
5382 MACRO_END
5383
5384 assert(!object1->copy);
5385 assert(!object2->copy);
5386
5387 assert(!object1->shadow);
5388 assert(!object2->shadow);
5389
5390 __TRANSPOSE_FIELD(shadow_offset); /* used by phys_contiguous objects */
5391 __TRANSPOSE_FIELD(pager);
5392 __TRANSPOSE_FIELD(paging_offset);
5393
5394 __TRANSPOSE_FIELD(pager_control);
5395 /* update the memory_objects' pointers back to the VM objects */
5396 if (object1->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
5397 memory_object_control_collapse(object1->pager_control,
5398 object1);
5399 }
5400 if (object2->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
5401 memory_object_control_collapse(object2->pager_control,
5402 object2);
5403 }
5404
5405 __TRANSPOSE_FIELD(absent_count);
5406
5407 assert(object1->paging_in_progress);
5408 assert(object2->paging_in_progress);
5409
5410 __TRANSPOSE_FIELD(pager_created);
5411 __TRANSPOSE_FIELD(pager_initialized);
5412 __TRANSPOSE_FIELD(pager_ready);
5413 __TRANSPOSE_FIELD(pager_trusted);
5414 __TRANSPOSE_FIELD(internal);
5415 __TRANSPOSE_FIELD(temporary);
5416 __TRANSPOSE_FIELD(private);
5417 __TRANSPOSE_FIELD(pageout);
5418 __TRANSPOSE_FIELD(true_share);
5419 __TRANSPOSE_FIELD(phys_contiguous);
5420 __TRANSPOSE_FIELD(nophyscache);
5421 __TRANSPOSE_FIELD(last_alloc);
5422 __TRANSPOSE_FIELD(sequential);
5423 __TRANSPOSE_FIELD(cluster_size);
5424 __TRANSPOSE_FIELD(existence_map);
5425 __TRANSPOSE_FIELD(cow_hint);
5426 __TRANSPOSE_FIELD(wimg_bits);
5427
5428 #undef __TRANSPOSE_FIELD
5429
5430 retval = KERN_SUCCESS;
5431
5432 done:
5433 /*
5434 * Cleanup.
5435 */
5436 if (tmp_object != VM_OBJECT_NULL) {
5437 vm_object_paging_end(tmp_object);
5438 vm_object_unlock(tmp_object);
5439 /*
5440 * Re-initialize the temporary object to avoid
5441 * deallocating a real pager.
5442 */
5443 _vm_object_allocate(transpose_size, tmp_object);
5444 vm_object_deallocate(tmp_object);
5445 tmp_object = VM_OBJECT_NULL;
5446 }
5447
5448 if (object1_locked) {
5449 vm_object_unlock(object1);
5450 object1_locked = FALSE;
5451 }
5452 if (object2_locked) {
5453 vm_object_unlock(object2);
5454 object2_locked = FALSE;
5455 }
5456 if (object1_paging) {
5457 vm_object_lock(object1);
5458 vm_object_paging_end(object1);
5459 vm_object_unlock(object1);
5460 object1_paging = FALSE;
5461 }
5462 if (object2_paging) {
5463 vm_object_lock(object2);
5464 vm_object_paging_end(object2);
5465 vm_object_unlock(object2);
5466 object2_paging = FALSE;
5467 }
5468
5469 return retval;
5470 }
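/*
 * Illustrative sketch (hypothetical): exchanging the backing store of
 * two equally sized objects.  Per the comment above, the caller must
 * not hold either object lock and should have quiesced both objects
 * beforehand via a UPL created with UPL_SET_IO_WIRE and
 * UPL_BLOCK_ACCESS.  "transpose_size" must match both objects' sizes.
 *
 *	kr = vm_object_transpose(object1, object2, transpose_size);
 *	if (kr == KERN_INVALID_VALUE) {
 *		// identical/NULL objects, size mismatch, or
 *		// copy/shadow/purgable objects are refused
 *	}
 */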
5471
5472
5473 /* Allow manipulation of individual page state. This is actually part of */
5474 /* the UPL regimen but takes place on the VM object rather than on a UPL */
5475
5476 kern_return_t
5477 vm_object_page_op(
5478 vm_object_t object,
5479 vm_object_offset_t offset,
5480 int ops,
5481 ppnum_t *phys_entry,
5482 int *flags)
5483 {
5484 vm_page_t dst_page;
5485
5486 vm_object_lock(object);
5487
5488 if(ops & UPL_POP_PHYSICAL) {
5489 if(object->phys_contiguous) {
5490 if (phys_entry) {
5491 *phys_entry = (ppnum_t)
5492 (object->shadow_offset >> 12);
5493 }
5494 vm_object_unlock(object);
5495 return KERN_SUCCESS;
5496 } else {
5497 vm_object_unlock(object);
5498 return KERN_INVALID_OBJECT;
5499 }
5500 }
5501 if(object->phys_contiguous) {
5502 vm_object_unlock(object);
5503 return KERN_INVALID_OBJECT;
5504 }
5505
5506 while(TRUE) {
5507 if((dst_page = vm_page_lookup(object,offset)) == VM_PAGE_NULL) {
5508 vm_object_unlock(object);
5509 return KERN_FAILURE;
5510 }
5511
5512 /* Sync up on getting the busy bit */
5513 if((dst_page->busy || dst_page->cleaning) &&
5514 (((ops & UPL_POP_SET) &&
5515 (ops & UPL_POP_BUSY)) || (ops & UPL_POP_DUMP))) {
5516 /* someone else is playing with the page, we will */
5517 /* have to wait */
5518 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
5519 continue;
5520 }
5521
5522 if (ops & UPL_POP_DUMP) {
5523 vm_page_lock_queues();
5524
5525 if (dst_page->no_isync == FALSE)
5526 pmap_disconnect(dst_page->phys_page);
5527 vm_page_free(dst_page);
5528
5529 vm_page_unlock_queues();
5530 break;
5531 }
5532
5533 if (flags) {
5534 *flags = 0;
5535
5536 /* Get the condition of flags before requested ops */
5537 /* are undertaken */
5538
5539 if(dst_page->dirty) *flags |= UPL_POP_DIRTY;
5540 if(dst_page->pageout) *flags |= UPL_POP_PAGEOUT;
5541 if(dst_page->precious) *flags |= UPL_POP_PRECIOUS;
5542 if(dst_page->absent) *flags |= UPL_POP_ABSENT;
5543 if(dst_page->busy) *flags |= UPL_POP_BUSY;
5544 }
5545
5546 /* The caller should have made a call, either concurrent with */
5547 /* or prior to this one, to set UPL_POP_BUSY */
5548 if(ops & UPL_POP_SET) {
5549 /* The protection granted with this assert will */
5550 /* not be complete. If the caller violates the */
5551 /* convention and attempts to change page state */
5552 /* without first setting busy we may not see it */
5553 /* because the page may already be busy. However */
5554 /* if such violations occur we will assert sooner */
5555 /* or later. */
5556 assert(dst_page->busy || (ops & UPL_POP_BUSY));
5557 if (ops & UPL_POP_DIRTY) dst_page->dirty = TRUE;
5558 if (ops & UPL_POP_PAGEOUT) dst_page->pageout = TRUE;
5559 if (ops & UPL_POP_PRECIOUS) dst_page->precious = TRUE;
5560 if (ops & UPL_POP_ABSENT) dst_page->absent = TRUE;
5561 if (ops & UPL_POP_BUSY) dst_page->busy = TRUE;
5562 }
5563
5564 if(ops & UPL_POP_CLR) {
5565 assert(dst_page->busy);
5566 if (ops & UPL_POP_DIRTY) dst_page->dirty = FALSE;
5567 if (ops & UPL_POP_PAGEOUT) dst_page->pageout = FALSE;
5568 if (ops & UPL_POP_PRECIOUS) dst_page->precious = FALSE;
5569 if (ops & UPL_POP_ABSENT) dst_page->absent = FALSE;
5570 if (ops & UPL_POP_BUSY) {
5571 dst_page->busy = FALSE;
5572 PAGE_WAKEUP(dst_page);
5573 }
5574 }
5575
5576 if (dst_page->encrypted) {
5577 /*
5578 * ENCRYPTED SWAP:
5579 * We need to decrypt this encrypted page before the
5580 * caller can access its contents.
5581 * But if the caller really wants to access the page's
5582 * contents, they have to keep the page "busy".
5583 * Otherwise, the page could get recycled or re-encrypted
5584 * at any time.
5585 */
5586 if ((ops & UPL_POP_SET) && (ops & UPL_POP_BUSY) &&
5587 dst_page->busy) {
5588 /*
5589 * The page is stable enough to be accessed by
5590 * the caller, so make sure its contents are
5591 * not encrypted.
5592 */
5593 vm_page_decrypt(dst_page, 0);
5594 } else {
5595 /*
5596 * The page is not busy, so don't bother
5597 * decrypting it, since anything could
5598 * happen to it between now and when the
5599 * caller wants to access it.
5600 * We should not give the caller access
5601 * to this page.
5602 */
5603 assert(!phys_entry);
5604 }
5605 }
5606
5607 if (phys_entry) {
5608 /*
5609 * The physical page number will remain valid
5610 * only if the page is kept busy.
5611 * ENCRYPTED SWAP: make sure we don't let the
5612 * caller access an encrypted page.
5613 */
5614 assert(dst_page->busy);
5615 assert(!dst_page->encrypted);
5616 *phys_entry = dst_page->phys_page;
5617 }
5618
5619 break;
5620 }
5621
5622 vm_object_unlock(object);
5623 return KERN_SUCCESS;
5624
5625 }
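/*
 * Illustrative sketch (hypothetical): pinning a single resident page
 * busy and retrieving its physical page number, then releasing it.
 * The UPL_POP_* flags are the ones handled above; NULL may be passed
 * for out-parameters the caller does not need.
 *
 *	ppnum_t	pnum;
 *	int	pflags;
 *
 *	kr = vm_object_page_op(object, offset,
 *			       UPL_POP_SET | UPL_POP_BUSY,
 *			       &pnum, &pflags);
 *	if (kr == KERN_SUCCESS) {
 *		// ... use pnum; it stays valid while the page is busy ...
 *		(void) vm_object_page_op(object, offset,
 *					 UPL_POP_CLR | UPL_POP_BUSY,
 *					 NULL, NULL);
 *	}
 */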
5626
5627 /*
5628 * vm_object_range_op offers a performance enhancement over
5629 * vm_object_page_op for page_op operations which do not require page
5630 * level state to be returned from the call. Page_op was created to provide
5631 * a low-cost alternative to page manipulation via UPLs when only a single
5632 * page was involved. The range_op call establishes the ability in the _op
5633 * family of functions to work on multiple pages where the lack of page level
5634 * state handling allows the caller to avoid the overhead of the upl structures.
5635 */
5636
5637 kern_return_t
5638 vm_object_range_op(
5639 vm_object_t object,
5640 vm_object_offset_t offset_beg,
5641 vm_object_offset_t offset_end,
5642 int ops,
5643 int *range)
5644 {
5645 vm_object_offset_t offset;
5646 vm_page_t dst_page;
5647
5648 if (object->resident_page_count == 0) {
5649 if (range) {
5650 if (ops & UPL_ROP_PRESENT)
5651 *range = 0;
5652 else
5653 *range = offset_end - offset_beg;
5654 }
5655 return KERN_SUCCESS;
5656 }
5657 vm_object_lock(object);
5658
5659 if (object->phys_contiguous) {
5660 vm_object_unlock(object);
5661 return KERN_INVALID_OBJECT;
5662 }
5663
5664 offset = offset_beg;
5665
5666 while (offset < offset_end) {
5667 dst_page = vm_page_lookup(object, offset);
5668 if (dst_page != VM_PAGE_NULL) {
5669 if (ops & UPL_ROP_DUMP) {
5670 if (dst_page->busy || dst_page->cleaning) {
5671 /*
5672 * someone else is playing with the
5673 * page, we will have to wait
5674 */
5675 PAGE_SLEEP(object,
5676 dst_page, THREAD_UNINT);
5677 /*
5678 * need to look the page up again since its
5679 * state may have changed while we slept;
5680 * it might even belong to a different object
5681 * at this point
5682 */
5683 continue;
5684 }
5685 vm_page_lock_queues();
5686
5687 if (dst_page->no_isync == FALSE)
5688 pmap_disconnect(dst_page->phys_page);
5689 vm_page_free(dst_page);
5690
5691 vm_page_unlock_queues();
5692 } else if (ops & UPL_ROP_ABSENT)
5693 break;
5694 } else if (ops & UPL_ROP_PRESENT)
5695 break;
5696
5697 offset += PAGE_SIZE;
5698 }
5699 vm_object_unlock(object);
5700
5701 if (range)
5702 *range = offset - offset_beg;
5703
5704 return KERN_SUCCESS;
5705 }
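/*
 * Illustrative sketch (hypothetical): measuring the leading run of
 * resident pages in [offset_beg, offset_end).  With UPL_ROP_PRESENT the
 * walk above stops at the first non-resident page, so "range" comes
 * back as the number of bytes of contiguous resident pages starting at
 * offset_beg.
 *
 *	int resident_bytes;
 *
 *	kr = vm_object_range_op(object, offset_beg, offset_end,
 *				UPL_ROP_PRESENT, &resident_bytes);
 */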