1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_object.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Virtual memory object module.
63 */
64
65 #include <debug.h>
66 #include <mach_pagemap.h>
67 #include <task_swapper.h>
68
69 #include <mach/mach_types.h>
70 #include <mach/memory_object.h>
71 #include <mach/memory_object_default.h>
72 #include <mach/memory_object_control_server.h>
73 #include <mach/vm_param.h>
74
75 #include <ipc/ipc_types.h>
76 #include <ipc/ipc_port.h>
77
78 #include <kern/kern_types.h>
79 #include <kern/assert.h>
80 #include <kern/lock.h>
81 #include <kern/queue.h>
82 #include <kern/xpr.h>
83 #include <kern/zalloc.h>
84 #include <kern/host.h>
85 #include <kern/host_statistics.h>
86 #include <kern/processor.h>
87 #include <kern/misc_protos.h>
88
89 #include <vm/memory_object.h>
90 #include <vm/vm_fault.h>
91 #include <vm/vm_map.h>
92 #include <vm/vm_object.h>
93 #include <vm/vm_page.h>
94 #include <vm/vm_pageout.h>
95 #include <vm/vm_protos.h>
96 #include <vm/vm_purgeable_internal.h>
97
98 /*
99 * Virtual memory objects maintain the actual data
100 * associated with allocated virtual memory. A given
101 * page of memory exists within exactly one object.
102 *
103 * An object is only deallocated when all "references"
104 * are given up.
105 *
106 * Associated with each object is a list of all resident
107 * memory pages belonging to that object; this list is
108 * maintained by the "vm_page" module, but locked by the object's
109 * lock.
110 *
111 * Each object also records the memory object reference
112 * that is used by the kernel to request and write
113 * back data (the memory object, field "pager"), etc...
114 *
115 * Virtual memory objects are allocated to provide
116 * zero-filled memory (vm_allocate) or map a user-defined
117 * memory object into a virtual address space (vm_map).
118 *
119 * Virtual memory objects that refer to a user-defined
120 * memory object are called "permanent", because all changes
121 * made in virtual memory are reflected back to the
122 * memory manager, which may then store them permanently.
123 * Other virtual memory objects are called "temporary",
124 * meaning that changes need be written back only when
125 * necessary to reclaim pages, and that storage associated
126 * with the object can be discarded once it is no longer
127 * mapped.
128 *
129 * A permanent memory object may be mapped into more
130 * than one virtual address space. Moreover, two threads
131 * may attempt to make the first mapping of a memory
132 * object concurrently. Only one thread is allowed to
133 * complete this mapping; all others wait until the
134 * "pager_initialized" field is asserted, indicating
135 * that the first thread has initialized all of the
136 * necessary fields in the virtual memory object structure.
137 *
138 * The kernel relies on a *default memory manager* to
139 * provide backing storage for the zero-filled virtual
140 * memory objects. The pager memory objects associated
141 * with these temporary virtual memory objects are only
142 * requested from the default memory manager when it
143 * becomes necessary. Virtual memory objects
144 * that depend on the default memory manager are called
145 * "internal". The "pager_created" field is provided to
146 * indicate whether these ports have ever been allocated.
147 *
148 * The kernel may also create virtual memory objects to
149 * hold changed pages after a copy-on-write operation.
150 * In this case, the virtual memory object (and its
151 * backing storage -- its memory object) only contain
152 * those pages that have been changed. The "shadow"
153 * field refers to the virtual memory object that contains
154 * the remainder of the contents. The "shadow_offset"
155 * field indicates where in the "shadow" these contents begin.
156 * The "copy" field refers to a virtual memory object
157 * to which changed pages must be copied before changing
158 * this object, in order to implement another form
159 * of copy-on-write optimization.
160 *
161 * The virtual memory object structure also records
162 * the attributes associated with its memory object.
163 * The "pager_ready", "can_persist" and "copy_strategy"
164 * fields represent those attributes. The "cached_list"
165 * field is used in the implementation of the persistence
166 * attribute.
167 *
168 * ZZZ Continue this comment.
169 */
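/*
 * Illustrative sketch (added for exposition; not part of the original
 * module): the basic lifecycle of a temporary, internal VM object as
 * seen by a caller of this module.  Wrapped in "#if 0" so it is never
 * compiled; it only demonstrates the allocate / reference / deallocate
 * pattern described above.
 */
#if 0
static void
vm_object_lifecycle_example(void)
{
	vm_object_t	object;

	/* allocate a zero-filled, temporary, internal object */
	object = vm_object_allocate((vm_object_size_t) PAGE_SIZE);
	if (object == VM_OBJECT_NULL)
		return;

	/* take an extra reference, e.g. on behalf of a second mapping */
	vm_object_reference(object);

	/*
	 * Drop both references; dropping the last one lets the object
	 * be cached or terminated (see vm_object_deallocate below).
	 */
	vm_object_deallocate(object);
	vm_object_deallocate(object);
}
#endif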
170
171 /* Forward declarations for internal functions. */
172 static kern_return_t vm_object_terminate(
173 vm_object_t object);
174
175 extern void vm_object_remove(
176 vm_object_t object);
177
178 static vm_object_t vm_object_cache_trim(
179 boolean_t called_from_vm_object_deallocate);
180
181 static void vm_object_deactivate_all_pages(
182 vm_object_t object);
183
184 static kern_return_t vm_object_copy_call(
185 vm_object_t src_object,
186 vm_object_offset_t src_offset,
187 vm_object_size_t size,
188 vm_object_t *_result_object);
189
190 static void vm_object_do_collapse(
191 vm_object_t object,
192 vm_object_t backing_object);
193
194 static void vm_object_do_bypass(
195 vm_object_t object,
196 vm_object_t backing_object);
197
198 static void vm_object_release_pager(
199 memory_object_t pager);
200
201 static zone_t vm_object_zone; /* vm backing store zone */
202
203 /*
204 * All wired-down kernel memory belongs to a single virtual
205 * memory object (kernel_object) to avoid wasting data structures.
206 */
207 static struct vm_object kernel_object_store;
208 vm_object_t kernel_object;
209
210
211 /*
212 * The submap object is used as a placeholder for vm_map_submap
213 * operations. The object is declared in vm_map.c because it
214 * is exported by the vm_map module. The storage is declared
215 * here because it must be initialized here.
216 */
217 static struct vm_object vm_submap_object_store;
218
219 /*
220 * Virtual memory objects are initialized from
221 * a template (see vm_object_allocate).
222 *
223 * When adding a new field to the virtual memory
224 * object structure, be sure to add initialization
225 * (see _vm_object_allocate()).
226 */
227 static struct vm_object vm_object_template;
228
229 /*
230 * Virtual memory objects that are not referenced by
231 * any address maps, but that are allowed to persist
232 * (an attribute specified by the associated memory manager),
233 * are kept in a queue (vm_object_cached_list).
234 *
235 * When an object from this queue is referenced again,
236 * for example to make another address space mapping,
237 * it must be removed from the queue. That is, the
238 * queue contains *only* objects with zero references.
239 *
240 * The kernel may choose to terminate objects from this
241 * queue in order to reclaim storage. The current policy
242 * is to permit a fixed maximum number of unreferenced
243 * objects (vm_object_cached_max).
244 *
245 * A mutex (accessed by routines
246 * vm_object_cache_{lock,lock_try,unlock}) governs the
247 * object cache. It must be held when objects are
248 * added to or removed from the cache (in vm_object_terminate).
249 * The routines that acquire a reference to a virtual
250 * memory object based on one of the memory object ports
251 * must also lock the cache.
252 *
253 * Ideally, the object cache should be more isolated
254 * from the reference mechanism, so that the lock need
255 * not be held to make simple references.
256 */
257 static queue_head_t vm_object_cached_list;
258 static int vm_object_cached_count=0;
259 static int vm_object_cached_high; /* highest # cached objects */
260 static int vm_object_cached_max = 512; /* may be patched */
261
262 static decl_mutex_data(,vm_object_cached_lock_data)
263
264 #define vm_object_cache_lock() \
265 mutex_lock(&vm_object_cached_lock_data)
266 #define vm_object_cache_lock_try() \
267 mutex_try(&vm_object_cached_lock_data)
268 #define vm_object_cache_unlock() \
269 mutex_unlock(&vm_object_cached_lock_data)
270
271 #define VM_OBJECT_HASH_COUNT 1024
272 static queue_head_t vm_object_hashtable[VM_OBJECT_HASH_COUNT];
273 static struct zone *vm_object_hash_zone;
274
275 struct vm_object_hash_entry {
276 queue_chain_t hash_link; /* hash chain link */
277 memory_object_t pager; /* pager we represent */
278 vm_object_t object; /* corresponding object */
279 boolean_t waiting; /* someone waiting for
280 * termination */
281 };
282
283 typedef struct vm_object_hash_entry *vm_object_hash_entry_t;
284 #define VM_OBJECT_HASH_ENTRY_NULL ((vm_object_hash_entry_t) 0)
285
286 #define VM_OBJECT_HASH_SHIFT 8
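/*
 * Hash a pager reference into a bucket index: shift out the low-order
 * bits of the pointer (which carry little variation, presumably due to
 * allocation alignment) and take the result modulo the table size.
 */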
287 #define vm_object_hash(pager) \
288 ((((unsigned)pager) >> VM_OBJECT_HASH_SHIFT) % VM_OBJECT_HASH_COUNT)
289
290 void vm_object_hash_entry_free(
291 vm_object_hash_entry_t entry);
292
293 static void vm_object_reap(vm_object_t object);
294 static void vm_object_reap_async(vm_object_t object);
295 static void vm_object_reaper_thread(void);
296 static queue_head_t vm_object_reaper_queue; /* protected by vm_object_cache_lock() */
297 unsigned int vm_object_reap_count = 0;
298 unsigned int vm_object_reap_count_async = 0;
299
300 /*
301 * vm_object_hash_lookup looks up a pager in the hashtable
302 * and returns the corresponding entry, with optional removal.
303 */
304
305 static vm_object_hash_entry_t
306 vm_object_hash_lookup(
307 memory_object_t pager,
308 boolean_t remove_entry)
309 {
310 register queue_t bucket;
311 register vm_object_hash_entry_t entry;
312
313 bucket = &vm_object_hashtable[vm_object_hash(pager)];
314
315 entry = (vm_object_hash_entry_t)queue_first(bucket);
316 while (!queue_end(bucket, (queue_entry_t)entry)) {
317 if (entry->pager == pager && !remove_entry)
318 return(entry);
319 else if (entry->pager == pager) {
320 queue_remove(bucket, entry,
321 vm_object_hash_entry_t, hash_link);
322 return(entry);
323 }
324
325 entry = (vm_object_hash_entry_t)queue_next(&entry->hash_link);
326 }
327
328 return(VM_OBJECT_HASH_ENTRY_NULL);
329 }
330
331 /*
332 * vm_object_hash_insert inserts the specified
333 * pager / cache object association into the hashtable.
334 */
335
336 static void
337 vm_object_hash_insert(
338 vm_object_hash_entry_t entry)
339 {
340 register queue_t bucket;
341
342 bucket = &vm_object_hashtable[vm_object_hash(entry->pager)];
343
344 queue_enter(bucket, entry, vm_object_hash_entry_t, hash_link);
345 }
346
347 static vm_object_hash_entry_t
348 vm_object_hash_entry_alloc(
349 memory_object_t pager)
350 {
351 vm_object_hash_entry_t entry;
352
353 entry = (vm_object_hash_entry_t)zalloc(vm_object_hash_zone);
354 entry->pager = pager;
355 entry->object = VM_OBJECT_NULL;
356 entry->waiting = FALSE;
357
358 return(entry);
359 }
360
361 void
362 vm_object_hash_entry_free(
363 vm_object_hash_entry_t entry)
364 {
365 zfree(vm_object_hash_zone, entry);
366 }
367
368 /*
369 * vm_object_allocate:
370 *
371 * Returns a new object with the given size.
372 */
373
374 __private_extern__ void
375 _vm_object_allocate(
376 vm_object_size_t size,
377 vm_object_t object)
378 {
379 XPR(XPR_VM_OBJECT,
380 "vm_object_allocate, object 0x%X size 0x%X\n",
381 (integer_t)object, size, 0,0,0);
382
383 *object = vm_object_template;
384 queue_init(&object->memq);
385 queue_init(&object->msr_q);
386 #ifdef UPL_DEBUG
387 queue_init(&object->uplq);
388 #endif /* UPL_DEBUG */
389 vm_object_lock_init(object);
390 object->size = size;
391 }
392
393 __private_extern__ vm_object_t
394 vm_object_allocate(
395 vm_object_size_t size)
396 {
397 register vm_object_t object;
398
399 object = (vm_object_t) zalloc(vm_object_zone);
400
401 // dbgLog(object, size, 0, 2); /* (TEST/DEBUG) */
402
403 if (object != VM_OBJECT_NULL)
404 _vm_object_allocate(size, object);
405
406 return object;
407 }
408
409
410 lck_grp_t vm_object_lck_grp;
411 lck_grp_attr_t vm_object_lck_grp_attr;
412 lck_attr_t vm_object_lck_attr;
413 lck_attr_t kernel_object_lck_attr;
414
415 /*
416 * vm_object_bootstrap:
417 *
418 * Initialize the VM objects module.
419 */
420 __private_extern__ void
421 vm_object_bootstrap(void)
422 {
423 register int i;
424
425 vm_object_zone = zinit((vm_size_t) sizeof(struct vm_object),
426 round_page_32(512*1024),
427 round_page_32(12*1024),
428 "vm objects");
429
430 queue_init(&vm_object_reaper_queue);
431 queue_init(&vm_object_cached_list);
432 mutex_init(&vm_object_cached_lock_data, 0);
433
434 vm_object_hash_zone =
435 zinit((vm_size_t) sizeof (struct vm_object_hash_entry),
436 round_page_32(512*1024),
437 round_page_32(12*1024),
438 "vm object hash entries");
439
440 for (i = 0; i < VM_OBJECT_HASH_COUNT; i++)
441 queue_init(&vm_object_hashtable[i]);
442
443 vm_object_init_lck_grp();
444
445 /*
446 * Fill in a template object, for quick initialization
447 */
448
449 /* memq; Lock; init after allocation */
450 vm_object_template.memq.prev = NULL;
451 vm_object_template.memq.next = NULL;
452 #if 0
453 /*
454 * We can't call vm_object_lock_init() here because that will
455 * allocate some memory and VM is not fully initialized yet.
457 * The lock will be initialized for each allocated object in
457 * _vm_object_allocate(), so we don't need to initialize it in
458 * the vm_object_template.
459 */
460 vm_object_lock_init(&vm_object_template);
461 #endif
462 vm_object_template.size = 0;
463 vm_object_template.memq_hint = VM_PAGE_NULL;
464 vm_object_template.ref_count = 1;
465 #if TASK_SWAPPER
466 vm_object_template.res_count = 1;
467 #endif /* TASK_SWAPPER */
468 vm_object_template.resident_page_count = 0;
469 vm_object_template.copy = VM_OBJECT_NULL;
470 vm_object_template.shadow = VM_OBJECT_NULL;
471 vm_object_template.shadow_offset = (vm_object_offset_t) 0;
472 vm_object_template.pager = MEMORY_OBJECT_NULL;
473 vm_object_template.paging_offset = 0;
474 vm_object_template.pager_control = MEMORY_OBJECT_CONTROL_NULL;
475 vm_object_template.copy_strategy = MEMORY_OBJECT_COPY_SYMMETRIC;
476 vm_object_template.paging_in_progress = 0;
477
478 /* Begin bitfields */
479 vm_object_template.all_wanted = 0; /* all bits FALSE */
480 vm_object_template.pager_created = FALSE;
481 vm_object_template.pager_initialized = FALSE;
482 vm_object_template.pager_ready = FALSE;
483 vm_object_template.pager_trusted = FALSE;
484 vm_object_template.can_persist = FALSE;
485 vm_object_template.internal = TRUE;
486 vm_object_template.temporary = TRUE;
487 vm_object_template.private = FALSE;
488 vm_object_template.pageout = FALSE;
489 vm_object_template.alive = TRUE;
490 vm_object_template.purgable = VM_PURGABLE_DENY;
491 vm_object_template.shadowed = FALSE;
492 vm_object_template.silent_overwrite = FALSE;
493 vm_object_template.advisory_pageout = FALSE;
494 vm_object_template.true_share = FALSE;
495 vm_object_template.terminating = FALSE;
496 vm_object_template.named = FALSE;
497 vm_object_template.shadow_severed = FALSE;
498 vm_object_template.phys_contiguous = FALSE;
499 vm_object_template.nophyscache = FALSE;
500 /* End bitfields */
501
502 vm_object_template.cached_list.prev = NULL;
503 vm_object_template.cached_list.next = NULL;
504 vm_object_template.msr_q.prev = NULL;
505 vm_object_template.msr_q.next = NULL;
506
507 vm_object_template.last_alloc = (vm_object_offset_t) 0;
508 vm_object_template.sequential = (vm_object_offset_t) 0;
509 vm_object_template.pages_created = 0;
510 vm_object_template.pages_used = 0;
511
512 #if MACH_PAGEMAP
513 vm_object_template.existence_map = VM_EXTERNAL_NULL;
514 #endif /* MACH_PAGEMAP */
515 vm_object_template.cow_hint = ~(vm_offset_t)0;
516 #if MACH_ASSERT
517 vm_object_template.paging_object = VM_OBJECT_NULL;
518 #endif /* MACH_ASSERT */
519
520 /* cache bitfields */
521 vm_object_template.wimg_bits = VM_WIMG_DEFAULT;
522 vm_object_template.code_signed = FALSE;
523 vm_object_template.not_in_use = 0;
524 #ifdef UPL_DEBUG
525 vm_object_template.uplq.prev = NULL;
526 vm_object_template.uplq.next = NULL;
527 #endif /* UPL_DEBUG */
528 #ifdef VM_PIP_DEBUG
529 bzero(&vm_object_template.pip_holders,
530 sizeof (vm_object_template.pip_holders));
531 #endif /* VM_PIP_DEBUG */
532
533 vm_object_template.objq.next=NULL;
534 vm_object_template.objq.prev=NULL;
535
536
537 /*
538 * Initialize the "kernel object"
539 */
540
541 kernel_object = &kernel_object_store;
542
543 /*
544 * Note that in the following size specifications, we need to add 1 because
545 * VM_MAX_KERNEL_ADDRESS (vm_last_addr) is a maximum address, not a size.
546 */
547
548 #ifdef ppc
549 _vm_object_allocate((vm_last_addr - VM_MIN_KERNEL_ADDRESS) + 1,
550 kernel_object);
551 #else
552 _vm_object_allocate((VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) + 1,
553 kernel_object);
554 #endif
555 kernel_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
556
557 /*
558 * Initialize the "submap object". Make it as large as the
559 * kernel object so that no limit is imposed on submap sizes.
560 */
561
562 vm_submap_object = &vm_submap_object_store;
563 #ifdef ppc
564 _vm_object_allocate((vm_last_addr - VM_MIN_KERNEL_ADDRESS) + 1,
565 vm_submap_object);
566 #else
567 _vm_object_allocate((VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) + 1,
568 vm_submap_object);
569 #endif
570 vm_submap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
571
572 /*
573 * Create an "extra" reference to this object so that we never
574 * try to deallocate it; zfree doesn't like to be called with
575 * non-zone memory.
576 */
577 vm_object_reference(vm_submap_object);
578
579 #if MACH_PAGEMAP
580 vm_external_module_initialize();
581 #endif /* MACH_PAGEMAP */
582 }
583
584 void
585 vm_object_reaper_init(void)
586 {
587 kern_return_t kr;
588 thread_t thread;
589
590 kr = kernel_thread_start_priority(
591 (thread_continue_t) vm_object_reaper_thread,
592 NULL,
593 BASEPRI_PREEMPT - 1,
594 &thread);
595 if (kr != KERN_SUCCESS) {
596 panic("failed to launch vm_object_reaper_thread kr=0x%x", kr);
597 }
598 thread_deallocate(thread);
599 }
600
601 __private_extern__ void
602 vm_object_init(void)
603 {
604 /*
605 * Finish initializing the kernel object.
606 */
607 }
608
609
610 __private_extern__ void
611 vm_object_init_lck_grp(void)
612 {
613 /*
614 * initialize the vm_object lock world
615 */
616 lck_grp_attr_setdefault(&vm_object_lck_grp_attr);
617 lck_grp_init(&vm_object_lck_grp, "vm_object", &vm_object_lck_grp_attr);
618 lck_attr_setdefault(&vm_object_lck_attr);
619 lck_attr_setdefault(&kernel_object_lck_attr);
620 lck_attr_cleardebug(&kernel_object_lck_attr);
621 }
622
623
624 #define MIGHT_NOT_CACHE_SHADOWS 1
625 #if MIGHT_NOT_CACHE_SHADOWS
626 static int cache_shadows = TRUE;
627 #endif /* MIGHT_NOT_CACHE_SHADOWS */
628
629 /*
630 * vm_object_deallocate:
631 *
632 * Release a reference to the specified object,
633 * gained either through a vm_object_allocate
634 * or a vm_object_reference call. When all references
635 * are gone, storage associated with this object
636 * may be relinquished.
637 *
638 * No object may be locked.
639 */
640 unsigned long vm_object_deallocate_shared_successes = 0;
641 unsigned long vm_object_deallocate_shared_failures = 0;
642 unsigned long vm_object_deallocate_shared_swap_failures = 0;
643 __private_extern__ void
644 vm_object_deallocate(
645 register vm_object_t object)
646 {
647 boolean_t retry_cache_trim = FALSE;
648 vm_object_t shadow = VM_OBJECT_NULL;
649 uint32_t try_failed_count = 0;
650
651 // if(object)dbgLog(object, object->ref_count, object->can_persist, 3); /* (TEST/DEBUG) */
652 // else dbgLog(object, 0, 0, 3); /* (TEST/DEBUG) */
653
654 if (object == VM_OBJECT_NULL)
655 return;
656
657 if (object == kernel_object) {
658 vm_object_lock(kernel_object);
659 kernel_object->ref_count--;
660 if (kernel_object->ref_count == 0) {
661 panic("vm_object_deallocate: losing kernel_object\n");
662 }
663 vm_object_unlock(kernel_object);
664 return;
665 }
666
667 if (object->ref_count > 2 ||
668 (!object->named && object->ref_count > 1)) {
669 UInt32 original_ref_count;
670 volatile UInt32 *ref_count_p;
671 Boolean atomic_swap;
672
673 /*
674 * The object currently looks like it is not being
675 * kept alive solely by the reference we're about to release.
676 * Let's try and release our reference without taking
677 * all the locks we would need if we had to terminate the
678 * object (cache lock + exclusive object lock).
679 * Lock the object "shared" to make sure we don't race with
680 * anyone holding it "exclusive".
681 */
682 vm_object_lock_shared(object);
683 ref_count_p = (volatile UInt32 *) &object->ref_count;
684 original_ref_count = object->ref_count;
685 /*
686 * Test again as "ref_count" could have changed.
687 * "named" shouldn't change.
688 */
689 if (original_ref_count > 2 ||
690 (!object->named && original_ref_count > 1)) {
691 atomic_swap = OSCompareAndSwap(
692 original_ref_count,
693 original_ref_count - 1,
694 (UInt32 *) &object->ref_count);
695 if (atomic_swap == FALSE) {
696 vm_object_deallocate_shared_swap_failures++;
697 }
698
699 } else {
700 atomic_swap = FALSE;
701 }
702 vm_object_unlock(object);
703
704 if (atomic_swap) {
705 /* ref_count was updated atomically ! */
706 vm_object_deallocate_shared_successes++;
707 return;
708 }
709
710 /*
711 * Someone else updated the ref_count at the same
712 * time and we lost the race. Fall back to the usual
713 * slow but safe path...
714 */
715 vm_object_deallocate_shared_failures++;
716 }
717
718 while (object != VM_OBJECT_NULL) {
719
720 /*
721 * The cache holds a reference (uncounted) to
722 * the object; we must lock it before removing
723 * the object.
724 */
725 for (;;) {
726 vm_object_cache_lock();
727
728 /*
729 * if we try to take a regular lock here
730 * we risk deadlocking against someone
731 * holding a lock on this object while
732 * trying to vm_object_deallocate a different
733 * object
734 */
735 if (vm_object_lock_try(object))
736 break;
737 vm_object_cache_unlock();
738 try_failed_count++;
739
740 mutex_pause(try_failed_count); /* wait a bit */
741 }
742 assert(object->ref_count > 0);
743
744 /*
745 * If the object has a named reference, and only
746 * that reference would remain, inform the pager
747 * about the last "mapping" reference going away.
748 */
749 if ((object->ref_count == 2) && (object->named)) {
750 memory_object_t pager = object->pager;
751
752 /* Notify the Pager that there are no */
753 /* more mappers for this object */
754
755 if (pager != MEMORY_OBJECT_NULL) {
756 vm_object_unlock(object);
757 vm_object_cache_unlock();
758
759 memory_object_unmap(pager);
760
761 try_failed_count = 0;
762 for (;;) {
763 vm_object_cache_lock();
764
765 /*
766 * if we try to take a regular lock here
767 * we risk deadlocking against someone
768 * holding a lock on this object while
769 * trying to vm_object_deallocate a different
770 * object
771 */
772 if (vm_object_lock_try(object))
773 break;
774 vm_object_cache_unlock();
775 try_failed_count++;
776
777 mutex_pause(try_failed_count); /* wait a bit */
778 }
779 assert(object->ref_count > 0);
780 }
781 }
782
783 /*
784 * Lose the reference. If other references
785 * remain, then we are done, unless we need
786 * to retry a cache trim.
787 * If it is the last reference, then keep it
788 * until any pending initialization is completed.
789 */
790
791 /* if the object is terminating, it cannot go into */
792 /* the cache and we obviously should not call */
793 /* terminate again. */
794
795 if ((object->ref_count > 1) || object->terminating) {
796 vm_object_lock_assert_exclusive(object);
797 object->ref_count--;
798 vm_object_res_deallocate(object);
799 vm_object_cache_unlock();
800
801 if (object->ref_count == 1 &&
802 object->shadow != VM_OBJECT_NULL) {
803 /*
804 * There's only one reference left on this
805 * VM object. We can't tell if it's a valid
806 * one (from a mapping for example) or if this
807 * object is just part of a possibly stale and
808 * useless shadow chain.
809 * We would like to try and collapse it into
810 * its parent, but we don't have any pointers
811 * back to this parent object.
812 * But we can try and collapse this object with
813 * its own shadows, in case these are useless
814 * too...
815 * We can't bypass this object though, since we
816 * don't know if this last reference on it is
817 * meaningful or not.
818 */
819 vm_object_collapse(object, 0, FALSE);
820 }
821
822 vm_object_unlock(object);
823 if (retry_cache_trim &&
824 ((object = vm_object_cache_trim(TRUE)) !=
825 VM_OBJECT_NULL)) {
826 continue;
827 }
828 return;
829 }
830
831 /*
832 * We have to wait for initialization
833 * before destroying or caching the object.
834 */
835
836 if (object->pager_created && ! object->pager_initialized) {
837 assert(! object->can_persist);
838 vm_object_assert_wait(object,
839 VM_OBJECT_EVENT_INITIALIZED,
840 THREAD_UNINT);
841 vm_object_unlock(object);
842 vm_object_cache_unlock();
843 thread_block(THREAD_CONTINUE_NULL);
844 continue;
845 }
846
847 /*
848 * If this object can persist, then enter it in
849 * the cache. Otherwise, terminate it.
850 *
851 * NOTE: Only permanent objects are cached, and
852 * permanent objects cannot have shadows. This
853 * affects the residence counting logic in a minor
854 * way (can do it in-line, mostly).
855 */
856
857 if ((object->can_persist) && (object->alive)) {
858 /*
859 * Now it is safe to decrement reference count,
860 * and to return if reference count is > 0.
861 */
862 vm_object_lock_assert_exclusive(object);
863 if (--object->ref_count > 0) {
864 vm_object_res_deallocate(object);
865 vm_object_unlock(object);
866 vm_object_cache_unlock();
867 if (retry_cache_trim &&
868 ((object = vm_object_cache_trim(TRUE)) !=
869 VM_OBJECT_NULL)) {
870 continue;
871 }
872 return;
873 }
874
875 #if MIGHT_NOT_CACHE_SHADOWS
876 /*
877 * Remove shadow now if we don't
878 * want to cache shadows.
879 */
880 if (! cache_shadows) {
881 shadow = object->shadow;
882 object->shadow = VM_OBJECT_NULL;
883 }
884 #endif /* MIGHT_NOT_CACHE_SHADOWS */
885
886 /*
887 * Enter the object onto the queue of
888 * cached objects, and deactivate
889 * all of its pages.
890 */
891 assert(object->shadow == VM_OBJECT_NULL);
892 VM_OBJ_RES_DECR(object);
893 XPR(XPR_VM_OBJECT,
894 "vm_o_deallocate: adding %x to cache, queue = (%x, %x)\n",
895 (integer_t)object,
896 (integer_t)vm_object_cached_list.next,
897 (integer_t)vm_object_cached_list.prev,0,0);
898
899 vm_object_cached_count++;
900 if (vm_object_cached_count > vm_object_cached_high)
901 vm_object_cached_high = vm_object_cached_count;
902 queue_enter(&vm_object_cached_list, object,
903 vm_object_t, cached_list);
904 vm_object_cache_unlock();
905 vm_object_deactivate_all_pages(object);
906 vm_object_unlock(object);
907
908 #if MIGHT_NOT_CACHE_SHADOWS
909 /*
910 * If we have a shadow that we need
911 * to deallocate, do so now, remembering
912 * to trim the cache later.
913 */
914 if (! cache_shadows && shadow != VM_OBJECT_NULL) {
915 object = shadow;
916 retry_cache_trim = TRUE;
917 continue;
918 }
919 #endif /* MIGHT_NOT_CACHE_SHADOWS */
920
921 /*
922 * Trim the cache. If the cache trim
923 * returns with a shadow for us to deallocate,
924 * then remember to retry the cache trim
925 * when we are done deallocating the shadow.
926 * Otherwise, we are done.
927 */
928
929 object = vm_object_cache_trim(TRUE);
930 if (object == VM_OBJECT_NULL) {
931 return;
932 }
933 retry_cache_trim = TRUE;
934
935 } else {
936 /*
937 * This object is not cacheable; terminate it.
938 */
939 XPR(XPR_VM_OBJECT,
940 "vm_o_deallocate: !cacheable 0x%X res %d paging_ops %d thread 0x%p ref %d\n",
941 (integer_t)object, object->resident_page_count,
942 object->paging_in_progress,
943 (void *)current_thread(),object->ref_count);
944
945 VM_OBJ_RES_DECR(object); /* XXX ? */
946 /*
947 * Terminate this object. If it had a shadow,
948 * then deallocate it; otherwise, if we need
949 * to retry a cache trim, do so now; otherwise,
950 * we are done. "pageout" objects have a shadow,
951 * but maintain a "paging reference" rather than
952 * a normal reference.
953 */
954 shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
955 if(vm_object_terminate(object) != KERN_SUCCESS) {
956 return;
957 }
958 if (shadow != VM_OBJECT_NULL) {
959 object = shadow;
960 continue;
961 }
962 if (retry_cache_trim &&
963 ((object = vm_object_cache_trim(TRUE)) !=
964 VM_OBJECT_NULL)) {
965 continue;
966 }
967 return;
968 }
969 }
970 assert(! retry_cache_trim);
971 }
972
973 /*
974 * Check to see whether we really need to trim
975 * down the cache. If so, remove an object from
976 * the cache, terminate it, and repeat.
977 *
978 * Called with, and returns with, cache lock unlocked.
979 */
980 vm_object_t
981 vm_object_cache_trim(
982 boolean_t called_from_vm_object_deallocate)
983 {
984 register vm_object_t object = VM_OBJECT_NULL;
985 vm_object_t shadow;
986
987 for (;;) {
988
989 /*
990 * If we no longer need to trim the cache,
991 * then we are done.
992 */
993
994 vm_object_cache_lock();
995 if (vm_object_cached_count <= vm_object_cached_max) {
996 vm_object_cache_unlock();
997 return VM_OBJECT_NULL;
998 }
999
1000 /*
1001 * We must trim down the cache, so remove
1002 * the first object in the cache.
1003 */
1004 XPR(XPR_VM_OBJECT,
1005 "vm_object_cache_trim: removing from front of cache (%x, %x)\n",
1006 (integer_t)vm_object_cached_list.next,
1007 (integer_t)vm_object_cached_list.prev, 0, 0, 0);
1008
1009 object = (vm_object_t) queue_first(&vm_object_cached_list);
1010 if(object == (vm_object_t) &vm_object_cached_list) {
1011 /* something's wrong with the calling parameter or */
1012 /* the value of vm_object_cached_count, just fix */
1013 /* and return */
1014 if(vm_object_cached_max < 0)
1015 vm_object_cached_max = 0;
1016 vm_object_cached_count = 0;
1017 vm_object_cache_unlock();
1018 return VM_OBJECT_NULL;
1019 }
1020 vm_object_lock(object);
1021 queue_remove(&vm_object_cached_list, object, vm_object_t,
1022 cached_list);
1023 vm_object_cached_count--;
1024
1025 /*
1026 * Since this object is in the cache, we know
1027 * that it is initialized and has no references.
1028 * Take a reference to avoid recursive deallocations.
1029 */
1030
1031 assert(object->pager_initialized);
1032 assert(object->ref_count == 0);
1033 vm_object_lock_assert_exclusive(object);
1034 object->ref_count++;
1035
1036 /*
1037 * Terminate the object.
1038 * If the object had a shadow, we let vm_object_deallocate
1039 * deallocate it. "pageout" objects have a shadow, but
1040 * maintain a "paging reference" rather than a normal
1041 * reference.
1042 * (We are careful here to limit recursion.)
1043 */
1044 shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
1045 if(vm_object_terminate(object) != KERN_SUCCESS)
1046 continue;
1047 if (shadow != VM_OBJECT_NULL) {
1048 if (called_from_vm_object_deallocate) {
1049 return shadow;
1050 } else {
1051 vm_object_deallocate(shadow);
1052 }
1053 }
1054 }
1055 }
1056
1057 #define VM_OBJ_TERM_STATS DEBUG
1058 #if VM_OBJ_TERM_STATS
1059 uint32_t vm_object_terminate_pages_freed = 0;
1060 uint32_t vm_object_terminate_pages_removed = 0;
1061 uint32_t vm_object_terminate_batches = 0;
1062 uint32_t vm_object_terminate_biggest_batch = 0;
1063 #endif /* VM_OBJ_TERM_STATS */
1064
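/*
 * Maximum number of pages processed per batch while holding the page
 * queues lock; the lock is yielded between batches (see the
 * mutex_yield() calls below).
 */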
1065 #define V_O_T_MAX_BATCH 256
1066
1067 /*
1068 * Routine: vm_object_terminate
1069 * Purpose:
1070 * Free all resources associated with a vm_object.
1071 * In/out conditions:
1072 * Upon entry, the object must be locked,
1073 * and the object must have exactly one reference.
1074 *
1075 * The shadow object reference is left alone.
1076 *
1077 * The object must be unlocked if it is found that pages
1078 * must be flushed to a backing object. If someone
1079 * manages to map the object while it is being flushed,
1080 * the object is returned unlocked and unchanged. Otherwise,
1081 * upon exit, the cache will be unlocked, and the
1082 * object will cease to exist.
1083 */
1084 static kern_return_t
1085 vm_object_terminate(
1086 register vm_object_t object)
1087 {
1088 register vm_page_t p;
1089 vm_object_t shadow_object;
1090 vm_page_t local_free_q;
1091 int loop_count;
1092 #if VM_OBJ_TERM_STATS
1093 uint32_t local_free_count;
1094 uint32_t pages_removed;
1095 #endif /* VM_OBJ_TERM_STATS */
1096
1097 #if VM_OBJ_TERM_STATS
1098 #define VM_OBJ_TERM_FREELIST_DEBUG(_pages_removed, _local_free_count) \
1099 MACRO_BEGIN \
1100 if (_pages_removed) { \
1101 hw_atomic_add(&vm_object_terminate_batches, 1); \
1102 hw_atomic_add(&vm_object_terminate_pages_removed, \
1103 _pages_removed); \
1104 hw_atomic_add(&vm_object_terminate_pages_freed, \
1105 _local_free_count); \
1106 if (_local_free_count > \
1107 vm_object_terminate_biggest_batch) { \
1108 vm_object_terminate_biggest_batch = \
1109 _local_free_count; \
1110 } \
1111 _local_free_count = 0; \
1112 } \
1113 MACRO_END
1114 #else /* VM_OBJ_TERM_STATS */
1115 #define VM_OBJ_TERM_FREELIST_DEBUG(_pages_removed, _local_free_count)
1116 #endif /* VM_OBJ_TERM_STATS */
1117
1118 #define VM_OBJ_TERM_FREELIST(_pages_removed, _local_free_count, _local_free_q) \
1119 MACRO_BEGIN \
1120 VM_OBJ_TERM_FREELIST_DEBUG(_pages_removed, _local_free_count); \
1121 if (_local_free_q) { \
1122 vm_page_free_list(_local_free_q); \
1123 _local_free_q = VM_PAGE_NULL; \
1124 } \
1125 MACRO_END
1126
1127
1128
1129 XPR(XPR_VM_OBJECT, "vm_object_terminate, object 0x%X ref %d\n",
1130 (integer_t)object, object->ref_count, 0, 0, 0);
1131
1132 local_free_q = VM_PAGE_NULL;
1133 #if VM_OBJ_TERM_STATS
1134 local_free_count = 0;
1135 pages_removed = 0;
1136 #endif /* VM_OBJ_TERM_STATS */
1137
1138 if (!object->pageout && (!object->temporary || object->can_persist)
1139 && (object->pager != NULL || object->shadow_severed)) {
1140 vm_object_cache_unlock();
1141 loop_count = V_O_T_MAX_BATCH;
1142 vm_page_lock_queues();
1143 while (!queue_empty(&object->memq)) {
1144 if (--loop_count == 0) {
1145 /*
1146 * Free the pages we've reclaimed so far and
1147 * take a little break to avoid hogging
1148 * the page queues lock too long.
1149 */
1150 VM_OBJ_TERM_FREELIST(pages_removed,
1151 local_free_count,
1152 local_free_q);
1153 mutex_yield(&vm_page_queue_lock);
1154 loop_count = V_O_T_MAX_BATCH;
1155 }
1156 /*
1157 * Clear pager_trusted bit so that the pages get yanked
1158 * out of the object instead of cleaned in place. This
1159 * prevents a deadlock in XMM and makes more sense anyway.
1160 */
1161 object->pager_trusted = FALSE;
1162
1163 p = (vm_page_t) queue_first(&object->memq);
1164
1165 VM_PAGE_CHECK(p);
1166
1167 if (p->busy || p->cleaning) {
1168 if(p->cleaning || p->absent) {
1169 /* free the pages reclaimed so far */
1170 VM_OBJ_TERM_FREELIST(pages_removed,
1171 local_free_count,
1172 local_free_q);
1173 vm_page_unlock_queues();
1174 vm_object_paging_wait(object, THREAD_UNINT);
1175 vm_page_lock_queues();
1176 continue;
1177 } else {
1178 panic("vm_object_terminate.3 %p %p", object, p);
1179 }
1180 }
1181
1182 p->busy = TRUE;
1183 VM_PAGE_QUEUES_REMOVE(p);
1184 #if VM_OBJ_TERM_STATS
1185 pages_removed++;
1186 #endif /* VM_OBJ_TERM_STATS */
1187
1188 if (p->absent || p->private) {
1189
1190 /*
1191 * For private pages, VM_PAGE_FREE just
1192 * leaves the page structure around for
1193 * its owner to clean up. For absent
1194 * pages, the structure is returned to
1195 * the appropriate pool.
1196 */
1197
1198 goto free_page;
1199 }
1200
1201 if (p->fictitious) {
1202 if (p->phys_page == vm_page_guard_addr) {
1203 goto free_page;
1204 }
1205 panic("vm_object_terminate.4 %p %p", object, p);
1206 }
1207
1208 if (!p->dirty && p->wpmapped)
1209 p->dirty = pmap_is_modified(p->phys_page);
1210
1211 if ((p->dirty || p->precious) && !p->error && object->alive) {
1212 /* free the pages reclaimed so far */
1213 VM_OBJ_TERM_FREELIST(pages_removed,
1214 local_free_count,
1215 local_free_q);
1216 vm_page_unlock_queues();
1217 vm_pageout_cluster(p); /* flush page */
1218 vm_object_paging_wait(object, THREAD_UNINT);
1219 XPR(XPR_VM_OBJECT,
1220 "vm_object_terminate restart, object 0x%X ref %d\n",
1221 (integer_t)object, object->ref_count, 0, 0, 0);
1222 vm_page_lock_queues();
1223 } else {
1224 free_page:
1225 /*
1226 * Add this page to our list of reclaimed pages,
1227 * to be freed later.
1228 */
1229 vm_page_free_prepare(p);
1230 p->pageq.next = (queue_entry_t) local_free_q;
1231 local_free_q = p;
1232 #if VM_OBJ_TERM_STATS
1233 local_free_count++;
1234 #endif /* VM_OBJ_TERM_STATS */
1235 }
1236 }
1237
1238 /*
1239 * Free the remaining reclaimed pages.
1240 */
1241 VM_OBJ_TERM_FREELIST(pages_removed,
1242 local_free_count,
1243 local_free_q);
1244 vm_page_unlock_queues();
1245 vm_object_unlock(object);
1246 vm_object_cache_lock();
1247 vm_object_lock(object);
1248 }
1249
1250 /*
1251 * Make sure the object isn't already being terminated
1252 */
1253 if(object->terminating) {
1254 vm_object_lock_assert_exclusive(object);
1255 object->ref_count--;
1256 assert(object->ref_count > 0);
1257 vm_object_cache_unlock();
1258 vm_object_unlock(object);
1259 return KERN_FAILURE;
1260 }
1261
1262 /*
1263 * Did somebody get a reference to the object while we were
1264 * cleaning it?
1265 */
1266 if(object->ref_count != 1) {
1267 vm_object_lock_assert_exclusive(object);
1268 object->ref_count--;
1269 assert(object->ref_count > 0);
1270 vm_object_res_deallocate(object);
1271 vm_object_cache_unlock();
1272 vm_object_unlock(object);
1273 return KERN_FAILURE;
1274 }
1275
1276 /*
1277 * Make sure no one can look us up now.
1278 */
1279
1280 object->terminating = TRUE;
1281 object->alive = FALSE;
1282 vm_object_remove(object);
1283
1284 /*
1285 * Detach the object from its shadow if we are the shadow's
1286 * copy. The reference we hold on the shadow must be dropped
1287 * by our caller.
1288 */
1289 if (((shadow_object = object->shadow) != VM_OBJECT_NULL) &&
1290 !(object->pageout)) {
1291 vm_object_lock(shadow_object);
1292 if (shadow_object->copy == object)
1293 shadow_object->copy = VM_OBJECT_NULL;
1294 vm_object_unlock(shadow_object);
1295 }
1296
1297 if (object->paging_in_progress != 0) {
1298 /*
1299 * There are still some paging_in_progress references
1300 * on this object, meaning that there are some paging
1301 * or other I/O operations in progress for this VM object.
1302 * Such operations take some paging_in_progress references
1303 * up front to ensure that the object doesn't go away, but
1304 * they may also need to acquire a reference on the VM object,
1305 * to map it in kernel space, for example. That means that
1306 * they may end up releasing the last reference on the VM
1307 * object, triggering its termination, while still holding
1308 * paging_in_progress references. Waiting for these
1309 * pending paging_in_progress references to go away here would
1310 * deadlock.
1311 *
1312 * To avoid deadlocking, we'll let the vm_object_reaper_thread
1313 * complete the VM object termination if it still holds
1314 * paging_in_progress references at this point.
1315 *
1316 * No new paging_in_progress should appear now that the
1317 * VM object is "terminating" and not "alive".
1318 */
1319 vm_object_reap_async(object);
1320 vm_object_cache_unlock();
1321 vm_object_unlock(object);
1322 /*
1323 * Return KERN_FAILURE to let the caller know that we
1324 * haven't completed the termination and it can't drop this
1325 * object's reference on its shadow object yet.
1326 * The reaper thread will take care of that once it has
1327 * completed this object's termination.
1328 */
1329 return KERN_FAILURE;
1330 }
1331
1332 /* complete the VM object termination */
1333 vm_object_reap(object);
1334 object = VM_OBJECT_NULL;
1335 /* cache lock and object lock were released by vm_object_reap() */
1336
1337 /*
1338 * KERN_SUCCESS means that this object has been terminated
1339 * and no longer needs its shadow object but still holds a
1340 * reference on it.
1341 * The caller is responsible for dropping that reference.
1342 * We can't call vm_object_deallocate() here because that
1343 * would create a recursion.
1344 */
1345 return KERN_SUCCESS;
1346 }
1347
1348 /*
1349 * vm_object_reap():
1350 *
1351 * Complete the termination of a VM object after it's been marked
1352 * as "terminating" and "!alive" by vm_object_terminate().
1353 *
1354 * The VM object cache and the VM object must be locked by caller.
1355 * The locks will be released on return and the VM object is no longer valid.
1356 */
1357 void
1358 vm_object_reap(
1359 vm_object_t object)
1360 {
1361 memory_object_t pager;
1362 vm_page_t p;
1363 vm_page_t local_free_q;
1364 int loop_count;
1365 #if VM_OBJ_TERM_STATS
1366 uint32_t local_free_count;
1367 #endif /* VM_OBJ_TERM_STATS */
1368
1369 #if DEBUG
1370 mutex_assert(&vm_object_cached_lock_data, MA_OWNED);
1371 #endif /* DEBUG */
1372 vm_object_lock_assert_exclusive(object);
1373 assert(object->paging_in_progress == 0);
1374
1375 vm_object_reap_count++;
1376
1377 local_free_q = VM_PAGE_NULL;
1378 #if VM_OBJ_TERM_STATS
1379 local_free_count = 0;
1380 #endif /* VM_OBJ_TERM_STATS */
1381
1382 pager = object->pager;
1383 object->pager = MEMORY_OBJECT_NULL;
1384
1385 if (pager != MEMORY_OBJECT_NULL)
1386 memory_object_control_disable(object->pager_control);
1387 vm_object_cache_unlock();
1388
1389 vm_object_lock_assert_exclusive(object);
1390 object->ref_count--;
1391 #if TASK_SWAPPER
1392 assert(object->res_count == 0);
1393 #endif /* TASK_SWAPPER */
1394
1395 assert (object->ref_count == 0);
1396
1397 /* remove from purgeable queue if it's on one */
1398 if (object->objq.next || object->objq.prev) {
1399 purgeable_q_t queue = vm_purgeable_object_remove(object);
1400 assert(queue);
1401
1402 /* Must take page lock for this - using it to protect token queue */
1403 vm_page_lock_queues();
1404 vm_purgeable_token_delete_first(queue);
1405
1406 assert(queue->debug_count_objects>=0);
1407 vm_page_unlock_queues();
1408 }
1409
1410 /*
1411 * Clean or free the pages, as appropriate.
1412 * It is possible for us to find busy/absent pages,
1413 * if some faults on this object were aborted.
1414 */
1415 if (object->pageout) {
1416 assert(object->shadow != VM_OBJECT_NULL);
1417
1418 vm_pageout_object_terminate(object);
1419
1420 } else if ((object->temporary && !object->can_persist) ||
1421 (pager == MEMORY_OBJECT_NULL)) {
1422 loop_count = V_O_T_MAX_BATCH;
1423 vm_page_lock_queues();
1424 while (!queue_empty(&object->memq)) {
1425 if (--loop_count == 0) {
1426 /*
1427 * Free the pages we reclaimed so far
1428 * and take a little break to avoid
1429 * hogging the page queue lock too long
1430 */
1431 VM_OBJ_TERM_FREELIST(local_free_count,
1432 local_free_count,
1433 local_free_q);
1434 mutex_yield(&vm_page_queue_lock);
1435 loop_count = V_O_T_MAX_BATCH;
1436 }
1437 p = (vm_page_t) queue_first(&object->memq);
1438
1439 vm_page_free_prepare(p);
1440
1441 assert(p->pageq.next == NULL && p->pageq.prev == NULL);
1442 p->pageq.next = (queue_entry_t) local_free_q;
1443 local_free_q = p;
1444 #if VM_OBJ_TERM_STATS
1445 local_free_count++;
1446 #endif /* VM_OBJ_TERM_STATS */
1447 }
1448 /*
1449 * Free the remaining reclaimed pages
1450 */
1451 VM_OBJ_TERM_FREELIST(local_free_count,
1452 local_free_count,
1453 local_free_q);
1454 vm_page_unlock_queues();
1455 } else if (!queue_empty(&object->memq)) {
1456 panic("vm_object_reap: queue just emptied isn't");
1457 }
1458
1459 assert(object->paging_in_progress == 0);
1460 assert(object->ref_count == 0);
1461
1462 /*
1463 * If the pager has not already been released by
1464 * vm_object_destroy, we need to terminate it and
1465 * release our reference to it here.
1466 */
1467 if (pager != MEMORY_OBJECT_NULL) {
1468 vm_object_unlock(object);
1469 vm_object_release_pager(pager);
1470 vm_object_lock(object);
1471 }
1472
1473 /* kick off anyone waiting on terminating */
1474 object->terminating = FALSE;
1475 vm_object_paging_begin(object);
1476 vm_object_paging_end(object);
1477 vm_object_unlock(object);
1478
1479 #if MACH_PAGEMAP
1480 vm_external_destroy(object->existence_map, object->size);
1481 #endif /* MACH_PAGEMAP */
1482
1483 object->shadow = VM_OBJECT_NULL;
1484
1485 vm_object_lock_destroy(object);
1486 /*
1487 * Free the space for the object.
1488 */
1489 zfree(vm_object_zone, object);
1490 object = VM_OBJECT_NULL;
1491 }
1492
1493 void
1494 vm_object_reap_async(
1495 vm_object_t object)
1496 {
1497 #if DEBUG
1498 mutex_assert(&vm_object_cached_lock_data, MA_OWNED);
1499 #endif /* DEBUG */
1500 vm_object_lock_assert_exclusive(object);
1501
1502 vm_object_reap_count_async++;
1503
1504 /* enqueue the VM object... */
1505 queue_enter(&vm_object_reaper_queue, object,
1506 vm_object_t, cached_list);
1507 /* ... and wake up the reaper thread */
1508 thread_wakeup((event_t) &vm_object_reaper_queue);
1509 }
1510
1511 void
1512 vm_object_reaper_thread(void)
1513 {
1514 vm_object_t object, shadow_object;
1515
1516 vm_object_cache_lock();
1517
1518 while (!queue_empty(&vm_object_reaper_queue)) {
1519 queue_remove_first(&vm_object_reaper_queue,
1520 object,
1521 vm_object_t,
1522 cached_list);
1523 vm_object_lock(object);
1524 assert(object->terminating);
1525 assert(!object->alive);
1526
1527 /*
1528 * The pageout daemon might be playing with our pages.
1529 * Now that the object is dead, it won't touch any more
1530 * pages, but some pages might already be on their way out.
1531 * Hence, we wait until the active paging activities have
1532 * ceased before we break the association with the pager
1533 * itself.
1534 */
1535 while (object->paging_in_progress != 0) {
1536 vm_object_cache_unlock();
1537 vm_object_wait(object,
1538 VM_OBJECT_EVENT_PAGING_IN_PROGRESS,
1539 THREAD_UNINT);
1540 vm_object_cache_lock();
1541 vm_object_lock(object);
1542 }
1543
1544 shadow_object =
1545 object->pageout ? VM_OBJECT_NULL : object->shadow;
1546
1547 vm_object_reap(object);
1548 /* cache is unlocked and object is no longer valid */
1549 object = VM_OBJECT_NULL;
1550
1551 if (shadow_object != VM_OBJECT_NULL) {
1552 /*
1553 * Drop the reference "object" was holding on
1554 * its shadow object.
1555 */
1556 vm_object_deallocate(shadow_object);
1557 shadow_object = VM_OBJECT_NULL;
1558 }
1559
1560 vm_object_cache_lock();
1561 }
1562
1563 /* wait for more work... */
1564 assert_wait((event_t) &vm_object_reaper_queue, THREAD_UNINT);
1565 vm_object_cache_unlock();
1566 thread_block((thread_continue_t) vm_object_reaper_thread);
1567 /*NOTREACHED*/
1568 }
1569
1570 /*
1571 * Routine: vm_object_pager_wakeup
1572 * Purpose: Wake up anyone waiting for termination of a pager.
1573 */
1574
1575 static void
1576 vm_object_pager_wakeup(
1577 memory_object_t pager)
1578 {
1579 vm_object_hash_entry_t entry;
1580 boolean_t waiting = FALSE;
1581
1582 /*
1583 * If anyone was waiting for the memory_object_terminate
1584 * to be queued, wake them up now.
1585 */
1586 vm_object_cache_lock();
1587 entry = vm_object_hash_lookup(pager, TRUE);
1588 if (entry != VM_OBJECT_HASH_ENTRY_NULL)
1589 waiting = entry->waiting;
1590 vm_object_cache_unlock();
1591 if (entry != VM_OBJECT_HASH_ENTRY_NULL) {
1592 if (waiting)
1593 thread_wakeup((event_t) pager);
1594 vm_object_hash_entry_free(entry);
1595 }
1596 }
1597
1598 /*
1599 * Routine: vm_object_release_pager
1600 * Purpose: Terminate the pager and, upon completion,
1601 * release our last reference to it.
1602 * Just like memory_object_terminate, except
1603 * that we wake up anyone blocked in vm_object_enter
1604 * waiting for the termination message to be queued
1605 * before calling memory_object_init.
1606 */
1607 static void
1608 vm_object_release_pager(
1609 memory_object_t pager)
1610 {
1611
1612 /*
1613 * Terminate the pager.
1614 */
1615
1616 (void) memory_object_terminate(pager);
1617
1618 /*
1619 * Wakeup anyone waiting for this terminate
1620 */
1621 vm_object_pager_wakeup(pager);
1622
1623 /*
1624 * Release reference to pager.
1625 */
1626 memory_object_deallocate(pager);
1627 }
1628
1629 /*
1630 * Routine: vm_object_destroy
1631 * Purpose:
1632 * Shut down a VM object, despite the
1633 * presence of address map (or other) references
1634 * to the vm_object.
1635 */
1636 kern_return_t
1637 vm_object_destroy(
1638 vm_object_t object,
1639 __unused kern_return_t reason)
1640 {
1641 memory_object_t old_pager;
1642
1643 if (object == VM_OBJECT_NULL)
1644 return(KERN_SUCCESS);
1645
1646 /*
1647 * Remove the pager association immediately.
1648 *
1649 * This will prevent the memory manager from further
1650 * meddling. [If it wanted to flush data or make
1651 * other changes, it should have done so before performing
1652 * the destroy call.]
1653 */
1654
1655 vm_object_cache_lock();
1656 vm_object_lock(object);
1657 object->can_persist = FALSE;
1658 object->named = FALSE;
1659 object->alive = FALSE;
1660
1661 /*
1662 * Rip out the pager from the vm_object now...
1663 */
1664
1665 vm_object_remove(object);
1666 old_pager = object->pager;
1667 object->pager = MEMORY_OBJECT_NULL;
1668 if (old_pager != MEMORY_OBJECT_NULL)
1669 memory_object_control_disable(object->pager_control);
1670 vm_object_cache_unlock();
1671
1672 /*
1673 * Wait for the existing paging activity (that got
1674 * through before we nulled out the pager) to subside.
1675 */
1676
1677 vm_object_paging_wait(object, THREAD_UNINT);
1678 vm_object_unlock(object);
1679
1680 /*
1681 * Terminate the object now.
1682 */
1683 if (old_pager != MEMORY_OBJECT_NULL) {
1684 vm_object_release_pager(old_pager);
1685
1686 /*
1687 * JMM - Release the caller's reference. This assumes the
1688 * caller had a reference to release, which is a big (but
1689 * currently valid) assumption if this is driven from the
1690 * vnode pager (it is holding a named reference when making
1691 * this call).
1692 */
1693 vm_object_deallocate(object);
1694
1695 }
1696 return(KERN_SUCCESS);
1697 }
1698
1699 #define VM_OBJ_DEACT_ALL_STATS DEBUG
1700 #if VM_OBJ_DEACT_ALL_STATS
1701 uint32_t vm_object_deactivate_all_pages_batches = 0;
1702 uint32_t vm_object_deactivate_all_pages_pages = 0;
1703 #endif /* VM_OBJ_DEACT_ALL_STATS */
1704 /*
1705 * vm_object_deactivate_all_pages
1706 *
1707 * Deactivate all pages in the specified object. (Keep its pages
1708 * in memory even though it is no longer referenced.)
1709 *
1710 * The object must be locked.
1711 */
1712 static void
1713 vm_object_deactivate_all_pages(
1714 register vm_object_t object)
1715 {
1716 register vm_page_t p;
1717 int loop_count;
1718 #if VM_OBJ_DEACT_ALL_STATS
1719 int pages_count;
1720 #endif /* VM_OBJ_DEACT_ALL_STATS */
1721 #define V_O_D_A_P_MAX_BATCH 256
1722
1723 loop_count = V_O_D_A_P_MAX_BATCH;
1724 #if VM_OBJ_DEACT_ALL_STATS
1725 pages_count = 0;
1726 #endif /* VM_OBJ_DEACT_ALL_STATS */
1727 vm_page_lock_queues();
1728 queue_iterate(&object->memq, p, vm_page_t, listq) {
1729 if (--loop_count == 0) {
1730 #if VM_OBJ_DEACT_ALL_STATS
1731 hw_atomic_add(&vm_object_deactivate_all_pages_batches,
1732 1);
1733 hw_atomic_add(&vm_object_deactivate_all_pages_pages,
1734 pages_count);
1735 pages_count = 0;
1736 #endif /* VM_OBJ_DEACT_ALL_STATS */
1737 mutex_yield(&vm_page_queue_lock);
1738 loop_count = V_O_D_A_P_MAX_BATCH;
1739 }
1740 if (!p->busy && !p->throttled) {
1741 #if VM_OBJ_DEACT_ALL_STATS
1742 pages_count++;
1743 #endif /* VM_OBJ_DEACT_ALL_STATS */
1744 vm_page_deactivate(p);
1745 }
1746 }
1747 #if VM_OBJ_DEACT_ALL_STATS
1748 if (pages_count) {
1749 hw_atomic_add(&vm_object_deactivate_all_pages_batches, 1);
1750 hw_atomic_add(&vm_object_deactivate_all_pages_pages,
1751 pages_count);
1752 pages_count = 0;
1753 }
1754 #endif /* VM_OBJ_DEACT_ALL_STATS */
1755 vm_page_unlock_queues();
1756 }
1757
1758 __private_extern__ void
1759 vm_object_deactivate_pages(
1760 vm_object_t object,
1761 vm_object_offset_t offset,
1762 vm_object_size_t size,
1763 boolean_t kill_page)
1764 {
1765 vm_object_t orig_object;
1766 int pages_moved = 0;
1767 int pages_found = 0;
1768
1769 /*
1770 * entered with object lock held, acquire a paging reference to
1771 * prevent the memory_object and control ports from
1772 * being destroyed.
1773 */
1774 orig_object = object;
1775
1776 for (;;) {
1777 register vm_page_t m;
1778 vm_object_offset_t toffset;
1779 vm_object_size_t tsize;
1780
1781 vm_object_paging_begin(object);
1782 vm_page_lock_queues();
1783
1784 for (tsize = size, toffset = offset; tsize; tsize -= PAGE_SIZE, toffset += PAGE_SIZE) {
1785
1786 if ((m = vm_page_lookup(object, toffset)) != VM_PAGE_NULL) {
1787
1788 pages_found++;
1789
1790 if ((m->wire_count == 0) && (!m->private) && (!m->gobbled) && (!m->busy)) {
1791
1792 assert(!m->laundry);
1793
1794 m->reference = FALSE;
1795 pmap_clear_reference(m->phys_page);
1796
1797 if ((kill_page) && (object->internal)) {
1798 m->precious = FALSE;
1799 m->dirty = FALSE;
1800 pmap_clear_modify(m->phys_page);
1801 #if MACH_PAGEMAP
1802 vm_external_state_clr(object->existence_map, offset);
1803 #endif /* MACH_PAGEMAP */
1804 }
1805
1806 if (!m->throttled) {
1807 VM_PAGE_QUEUES_REMOVE(m);
1808
1809 assert(!m->laundry);
1810 assert(m->object != kernel_object);
1811 assert(m->pageq.next == NULL &&
1812 m->pageq.prev == NULL);
1813
1814 if(m->zero_fill) {
1815 queue_enter_first(
1816 &vm_page_queue_zf,
1817 m, vm_page_t, pageq);
1818 vm_zf_queue_count++;
1819 } else {
1820 queue_enter_first(
1821 &vm_page_queue_inactive,
1822 m, vm_page_t, pageq);
1823 }
1824
1825 m->inactive = TRUE;
1826 if (!m->fictitious) {
1827 vm_page_inactive_count++;
1828 token_new_pagecount++;
1829 } else {
1830 assert(m->phys_page == vm_page_fictitious_addr);
1831 }
1832
1833 pages_moved++;
1834 }
1835 }
1836 }
1837 }
1838 vm_page_unlock_queues();
1839 vm_object_paging_end(object);
1840
1841 if (object->shadow) {
1842 vm_object_t tmp_object;
1843
1844 kill_page = 0;
1845
1846 offset += object->shadow_offset;
1847
1848 tmp_object = object->shadow;
1849 vm_object_lock(tmp_object);
1850
1851 if (object != orig_object)
1852 vm_object_unlock(object);
1853 object = tmp_object;
1854 } else
1855 break;
1856 }
1857 if (object != orig_object)
1858 vm_object_unlock(object);
1859 }
1860
1861 /*
1862 * Routine: vm_object_pmap_protect
1863 *
1864 * Purpose:
1865 * Reduces the permission for all physical
1866 * pages in the specified object range.
1867 *
1868 * If removing write permission only, it is
1869 * sufficient to protect only the pages in
1870 * the top-level object; only those pages may
1871 * have write permission.
1872 *
1873 * If removing all access, we must follow the
1874 * shadow chain from the top-level object to
1875 * remove access to all pages in shadowed objects.
1876 *
1877 * The object must *not* be locked. The object must
1878 * be temporary/internal.
1879 *
1880 * If pmap is not NULL, this routine assumes that
1881 * the only mappings for the pages are in that
1882 * pmap.
1883 */
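/*
 * Illustrative sketch (hypothetical caller, not taken from this file):
 * to revoke write access for a range whose mappings are known to live
 * in a single pmap, a caller might do
 *
 *	vm_object_pmap_protect(object, offset, size,
 *			       map->pmap, start,
 *			       cur_prot & ~VM_PROT_WRITE);
 *
 * passing PMAP_NULL instead when the mappings may span several pmaps,
 * which forces the per-page pmap_page_protect() path below.
 */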
1884
1885 __private_extern__ void
1886 vm_object_pmap_protect(
1887 register vm_object_t object,
1888 register vm_object_offset_t offset,
1889 vm_object_size_t size,
1890 pmap_t pmap,
1891 vm_map_offset_t pmap_start,
1892 vm_prot_t prot)
1893 {
1894 if (object == VM_OBJECT_NULL)
1895 return;
1896 size = vm_object_round_page(size);
1897 offset = vm_object_trunc_page(offset);
1898
1899 vm_object_lock(object);
1900
1901 if (object->phys_contiguous) {
1902 if (pmap != NULL) {
1903 vm_object_unlock(object);
1904 pmap_protect(pmap, pmap_start, pmap_start + size, prot);
1905 } else {
1906 vm_object_offset_t phys_start, phys_end, phys_addr;
1907
1908 phys_start = object->shadow_offset + offset;
1909 phys_end = phys_start + size;
1910 assert(phys_start <= phys_end);
1911 assert(phys_end <= object->shadow_offset + object->size);
1912 vm_object_unlock(object);
1913
1914 for (phys_addr = phys_start;
1915 phys_addr < phys_end;
1916 phys_addr += PAGE_SIZE_64) {
1917 pmap_page_protect(phys_addr >> 12, prot);
1918 }
1919 }
1920 return;
1921 }
1922
1923 assert(object->internal);
1924
1925 while (TRUE) {
1926 if (ptoa_64(object->resident_page_count) > size/2 && pmap != PMAP_NULL) {
1927 vm_object_unlock(object);
1928 pmap_protect(pmap, pmap_start, pmap_start + size, prot);
1929 return;
1930 }
1931
1932 /* if the range is large relative to the resident */
1933 /* page count, iterate over the resident pages; otherwise */
1934 /* a per-offset (inverse) page look-up will be faster */
1935 if (ptoa_64(object->resident_page_count / 4) < size) {
1936 vm_page_t p;
1937 vm_object_offset_t end;
1938
1939 end = offset + size;
1940
1941 if (pmap != PMAP_NULL) {
1942 queue_iterate(&object->memq, p, vm_page_t, listq) {
1943 if (!p->fictitious &&
1944 (offset <= p->offset) && (p->offset < end)) {
1945 vm_map_offset_t start;
1946
1947 start = pmap_start + p->offset - offset;
1948 pmap_protect(pmap, start, start + PAGE_SIZE_64, prot);
1949 }
1950 }
1951 } else {
1952 queue_iterate(&object->memq, p, vm_page_t, listq) {
1953 if (!p->fictitious &&
1954 (offset <= p->offset) && (p->offset < end)) {
1955
1956 pmap_page_protect(p->phys_page, prot);
1957 }
1958 }
1959 }
1960 } else {
1961 vm_page_t p;
1962 vm_object_offset_t end;
1963 vm_object_offset_t target_off;
1964
1965 end = offset + size;
1966
1967 if (pmap != PMAP_NULL) {
1968 for(target_off = offset;
1969 target_off < end;
1970 target_off += PAGE_SIZE) {
1971 p = vm_page_lookup(object, target_off);
1972 if (p != VM_PAGE_NULL) {
1973 vm_offset_t start;
1974 start = pmap_start +
1975 (vm_offset_t)(p->offset - offset);
1976 pmap_protect(pmap, start,
1977 start + PAGE_SIZE, prot);
1978 }
1979 }
1980 } else {
1981 for(target_off = offset;
1982 target_off < end; target_off += PAGE_SIZE) {
1983 p = vm_page_lookup(object, target_off);
1984 if (p != VM_PAGE_NULL) {
1985 pmap_page_protect(p->phys_page, prot);
1986 }
1987 }
1988 }
1989 }
1990
1991 if (prot == VM_PROT_NONE) {
1992 /*
1993 * Must follow shadow chain to remove access
1994 * to pages in shadowed objects.
1995 */
1996 register vm_object_t next_object;
1997
1998 next_object = object->shadow;
1999 if (next_object != VM_OBJECT_NULL) {
2000 offset += object->shadow_offset;
2001 vm_object_lock(next_object);
2002 vm_object_unlock(object);
2003 object = next_object;
2004 }
2005 else {
2006 /*
2007 * End of chain - we are done.
2008 */
2009 break;
2010 }
2011 }
2012 else {
2013 /*
2014 * Pages in shadowed objects may never have
2015 * write permission - we may stop here.
2016 */
2017 break;
2018 }
2019 }
2020
2021 vm_object_unlock(object);
2022 }
2023
2024 /*
2025 * Routine: vm_object_copy_slowly
2026 *
2027 * Description:
2028 * Copy the specified range of the source
2029 * virtual memory object without using
2030 * protection-based optimizations (such
2031 * as copy-on-write). The pages in the
2032 * region are actually copied.
2033 *
2034 * In/out conditions:
2035 * The caller must hold a reference and a lock
2036 * for the source virtual memory object. The source
2037 * object will be returned *unlocked*.
2038 *
2039 * Results:
2040 * If the copy is completed successfully, KERN_SUCCESS is
2041 * returned. If the caller asserted the interruptible
2042 * argument, and an interruption occurred while waiting
2043 * for a user-generated event, MACH_SEND_INTERRUPTED is
2044 * returned. Other values may be returned to indicate
2045 * hard errors during the copy operation.
2046 *
2047 * A new virtual memory object is returned in a
2048 * parameter (_result_object). The contents of this
2049 * new object, starting at a zero offset, are a copy
2050 * of the source memory region. In the event of
2051 * an error, this parameter will contain the value
2052 * VM_OBJECT_NULL.
2053 */
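/*
 * Shape of the copy loop below, for orientation: for each page-sized
 * chunk of the range, allocate a page in the new object (waiting for
 * free memory if necessary), fault the source page in with
 * vm_fault_page() (retrying on shortages, bailing out on interruption
 * or error), then vm_page_copy() it into the new page and activate
 * both pages.
 */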
2054 __private_extern__ kern_return_t
2055 vm_object_copy_slowly(
2056 register vm_object_t src_object,
2057 vm_object_offset_t src_offset,
2058 vm_object_size_t size,
2059 boolean_t interruptible,
2060 vm_object_t *_result_object) /* OUT */
2061 {
2062 vm_object_t new_object;
2063 vm_object_offset_t new_offset;
2064
2065 struct vm_object_fault_info fault_info;
2066
2067 XPR(XPR_VM_OBJECT, "v_o_c_slowly obj 0x%x off 0x%x size 0x%x\n",
2068 src_object, src_offset, size, 0, 0);
2069
2070 if (size == 0) {
2071 vm_object_unlock(src_object);
2072 *_result_object = VM_OBJECT_NULL;
2073 return(KERN_INVALID_ARGUMENT);
2074 }
2075
2076 /*
2077 * Prevent destruction of the source object while we copy.
2078 */
2079
2080 vm_object_reference_locked(src_object);
2081 vm_object_unlock(src_object);
2082
2083 /*
2084 * Create a new object to hold the copied pages.
2085 * A few notes:
2086 * We fill the new object starting at offset 0,
2087 * regardless of the input offset.
2088 * We don't bother to lock the new object within
2089 * this routine, since we have the only reference.
2090 */
2091
2092 new_object = vm_object_allocate(size);
2093 new_offset = 0;
2094
2095 assert(size == trunc_page_64(size)); /* Will the loop terminate? */
2096
2097 fault_info.interruptible = interruptible;
2098 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
2099 fault_info.user_tag = 0;
2100 fault_info.lo_offset = src_offset;
2101 fault_info.hi_offset = src_offset + size;
2102 fault_info.no_cache = FALSE;
2103
2104 for ( ;
2105 size != 0 ;
2106 src_offset += PAGE_SIZE_64,
2107 new_offset += PAGE_SIZE_64, size -= PAGE_SIZE_64
2108 ) {
2109 vm_page_t new_page;
2110 vm_fault_return_t result;
2111
2112 vm_object_lock(new_object);
2113
2114 while ((new_page = vm_page_alloc(new_object, new_offset))
2115 == VM_PAGE_NULL) {
2116
2117 vm_object_unlock(new_object);
2118
2119 if (!vm_page_wait(interruptible)) {
2120 vm_object_deallocate(new_object);
2121 vm_object_deallocate(src_object);
2122 *_result_object = VM_OBJECT_NULL;
2123 return(MACH_SEND_INTERRUPTED);
2124 }
2125 vm_object_lock(new_object);
2126 }
2127 vm_object_unlock(new_object);
2128
2129 do {
2130 vm_prot_t prot = VM_PROT_READ;
2131 vm_page_t _result_page;
2132 vm_page_t top_page;
2133 register
2134 vm_page_t result_page;
2135 kern_return_t error_code;
2136
2137 vm_object_lock(src_object);
2138 vm_object_paging_begin(src_object);
2139
2140 fault_info.cluster_size = size;
2141
2142 XPR(XPR_VM_FAULT,"vm_object_copy_slowly -> vm_fault_page",0,0,0,0,0);
2143 result = vm_fault_page(src_object, src_offset,
2144 VM_PROT_READ, FALSE,
2145 &prot, &_result_page, &top_page,
2146 (int *)0,
2147 &error_code, FALSE, FALSE, &fault_info);
2148
2149 switch(result) {
2150 case VM_FAULT_SUCCESS:
2151 result_page = _result_page;
2152
2153 /*
2154 * We don't need to hold the object
2155 * lock -- the busy page will be enough.
2156 * [We don't care about picking up any
2157 * new modifications.]
2158 *
2159 * Copy the page to the new object.
2160 *
2161 * POLICY DECISION:
2162 * If result_page is clean,
2163 * we could steal it instead
2164 * of copying.
2165 */
2166
2167 vm_object_unlock(result_page->object);
2168 vm_page_copy(result_page, new_page);
2169
2170 /*
2171 * Let go of both pages (make them
2172 * not busy, perform wakeup, activate).
2173 */
2174 vm_object_lock(new_object);
2175 new_page->dirty = TRUE;
2176 PAGE_WAKEUP_DONE(new_page);
2177 vm_object_unlock(new_object);
2178
2179 vm_object_lock(result_page->object);
2180 PAGE_WAKEUP_DONE(result_page);
2181
2182 vm_page_lockspin_queues();
2183 if (!result_page->active &&
2184 !result_page->inactive &&
2185 !result_page->throttled)
2186 vm_page_activate(result_page);
2187 vm_page_activate(new_page);
2188 vm_page_unlock_queues();
2189
2190 /*
2191 * Release paging references and
2192 * top-level placeholder page, if any.
2193 */
2194
2195 vm_fault_cleanup(result_page->object,
2196 top_page);
2197
2198 break;
2199
2200 case VM_FAULT_RETRY:
2201 break;
2202
2203 case VM_FAULT_FICTITIOUS_SHORTAGE:
2204 vm_page_more_fictitious();
2205 break;
2206
2207 case VM_FAULT_MEMORY_SHORTAGE:
2208 if (vm_page_wait(interruptible))
2209 break;
2210 /* fall thru */
2211
2212 case VM_FAULT_INTERRUPTED:
2213 vm_page_free(new_page);
2214 vm_object_deallocate(new_object);
2215 vm_object_deallocate(src_object);
2216 *_result_object = VM_OBJECT_NULL;
2217 return(MACH_SEND_INTERRUPTED);
2218
2219 case VM_FAULT_MEMORY_ERROR:
2220 /*
2221 * A policy choice:
2222 * (a) ignore pages that we can't
2223 * copy
2224 * (b) return the null object if
2225 * any page fails [chosen]
2226 */
2227
2228 vm_page_lock_queues();
2229 vm_page_free(new_page);
2230 vm_page_unlock_queues();
2231
2232 vm_object_deallocate(new_object);
2233 vm_object_deallocate(src_object);
2234 *_result_object = VM_OBJECT_NULL;
2235 return(error_code ? error_code:
2236 KERN_MEMORY_ERROR);
2237 }
2238 } while (result != VM_FAULT_SUCCESS);
2239 }
2240
2241 /*
2242 * Lose the extra reference, and return our object.
2243 */
2244 vm_object_deallocate(src_object);
2245 *_result_object = new_object;
2246 return(KERN_SUCCESS);
2247 }
2248
2249 /*
2250 * Routine: vm_object_copy_quickly
2251 *
2252 * Purpose:
2253 * Copy the specified range of the source virtual
2254 * memory object, if it can be done without waiting
2255 * for user-generated events.
2256 *
2257 * Results:
2258 * If the copy is successful, the copy is returned in
2259 * the arguments; otherwise, the arguments are not
2260 * affected.
2261 *
2262 * In/out conditions:
2263 * The object should be unlocked on entry and exit.
2264 */
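/*
 * Only the symmetric copy strategy is handled here without blocking;
 * for MEMORY_OBJECT_COPY_DELAY and every other strategy the caller
 * gets FALSE back and must fall back to one of the heavier copy
 * routines in this file.
 */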
2265
2266 /*ARGSUSED*/
2267 __private_extern__ boolean_t
2268 vm_object_copy_quickly(
2269 vm_object_t *_object, /* INOUT */
2270 __unused vm_object_offset_t offset, /* IN */
2271 __unused vm_object_size_t size, /* IN */
2272 boolean_t *_src_needs_copy, /* OUT */
2273 boolean_t *_dst_needs_copy) /* OUT */
2274 {
2275 vm_object_t object = *_object;
2276 memory_object_copy_strategy_t copy_strategy;
2277
2278 XPR(XPR_VM_OBJECT, "v_o_c_quickly obj 0x%x off 0x%x size 0x%x\n",
2279 *_object, offset, size, 0, 0);
2280 if (object == VM_OBJECT_NULL) {
2281 *_src_needs_copy = FALSE;
2282 *_dst_needs_copy = FALSE;
2283 return(TRUE);
2284 }
2285
2286 vm_object_lock(object);
2287
2288 copy_strategy = object->copy_strategy;
2289
2290 switch (copy_strategy) {
2291 case MEMORY_OBJECT_COPY_SYMMETRIC:
2292
2293 /*
2294 * Symmetric copy strategy.
2295 * Make another reference to the object.
2296 * Leave object/offset unchanged.
2297 */
2298
2299 vm_object_reference_locked(object);
2300 object->shadowed = TRUE;
2301 vm_object_unlock(object);
2302
2303 /*
2304 * Both source and destination must make
2305 * shadows, and the source must be made
2306 * read-only if not already.
2307 */
2308
2309 *_src_needs_copy = TRUE;
2310 *_dst_needs_copy = TRUE;
2311
2312 break;
2313
2314 case MEMORY_OBJECT_COPY_DELAY:
2315 vm_object_unlock(object);
2316 return(FALSE);
2317
2318 default:
2319 vm_object_unlock(object);
2320 return(FALSE);
2321 }
2322 return(TRUE);
2323 }
2324
2325 static int copy_call_count = 0;
2326 static int copy_call_sleep_count = 0;
2327 static int copy_call_restart_count = 0;
2328
2329 /*
2330 * Routine: vm_object_copy_call [internal]
2331 *
2332 * Description:
2333 * Copy the source object (src_object), using the
2334 * user-managed copy algorithm.
2335 *
2336 * In/out conditions:
2337 * The source object must be locked on entry. It
2338 * will be *unlocked* on exit.
2339 *
2340 * Results:
2341 * If the copy is successful, KERN_SUCCESS is returned.
2342 * A new object that represents the copied virtual
2343 * memory is returned in a parameter (*_result_object).
2344 * If the return value indicates an error, this parameter
2345 * is not valid.
2346 */
2347 static kern_return_t
2348 vm_object_copy_call(
2349 vm_object_t src_object,
2350 vm_object_offset_t src_offset,
2351 vm_object_size_t size,
2352 vm_object_t *_result_object) /* OUT */
2353 {
2354 kern_return_t kr;
2355 vm_object_t copy;
2356 boolean_t check_ready = FALSE;
2357 uint32_t try_failed_count = 0;
2358
2359 /*
2360 * If a copy is already in progress, wait and retry.
2361 *
2362 * XXX
2363 * Consider making this call interruptible, as Mike
2364 * intended it to be.
2365 *
2366 * XXXO
2367 * Need a counter or version or something to allow
2368 * us to use the copy that the currently requesting
2369 * thread is obtaining -- is it worth adding to the
2370 * vm object structure? Depends on how common this case is.
2371 */
2372 copy_call_count++;
2373 while (vm_object_wanted(src_object, VM_OBJECT_EVENT_COPY_CALL)) {
2374 vm_object_sleep(src_object, VM_OBJECT_EVENT_COPY_CALL,
2375 THREAD_UNINT);
2376 copy_call_restart_count++;
2377 }
2378
2379 /*
2380 * Indicate (for the benefit of memory_object_create_copy)
2381 * that we want a copy for src_object. (Note that we cannot
2382 * do a real assert_wait before calling memory_object_copy,
2383 * so we simply set the flag.)
2384 */
2385
2386 vm_object_set_wanted(src_object, VM_OBJECT_EVENT_COPY_CALL);
2387 vm_object_unlock(src_object);
2388
2389 /*
2390 * Ask the memory manager to give us a memory object
2391 * which represents a copy of the src object.
2392 * The memory manager may give us a memory object
2393 * which we already have, or it may give us a
2394 * new memory object. This memory object will arrive
2395 * via memory_object_create_copy.
2396 */
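/*
 * (As the XXX below notes, that request is not actually issued yet:
 * "kr" is hard-wired to KERN_FAILURE until memory_object.defs grows a
 * suitable interface, so this routine currently always returns early
 * here.)
 */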
2397
2398 kr = KERN_FAILURE; /* XXX need to change memory_object.defs */
2399 if (kr != KERN_SUCCESS) {
2400 return kr;
2401 }
2402
2403 /*
2404 * Wait for the copy to arrive.
2405 */
2406 vm_object_lock(src_object);
2407 while (vm_object_wanted(src_object, VM_OBJECT_EVENT_COPY_CALL)) {
2408 vm_object_sleep(src_object, VM_OBJECT_EVENT_COPY_CALL,
2409 THREAD_UNINT);
2410 copy_call_sleep_count++;
2411 }
2412 Retry:
2413 assert(src_object->copy != VM_OBJECT_NULL);
2414 copy = src_object->copy;
2415 if (!vm_object_lock_try(copy)) {
2416 vm_object_unlock(src_object);
2417
2418 try_failed_count++;
2419 mutex_pause(try_failed_count); /* wait a bit */
2420
2421 vm_object_lock(src_object);
2422 goto Retry;
2423 }
2424 if (copy->size < src_offset+size)
2425 copy->size = src_offset+size;
2426
2427 if (!copy->pager_ready)
2428 check_ready = TRUE;
2429
2430 /*
2431 * Return the copy.
2432 */
2433 *_result_object = copy;
2434 vm_object_unlock(copy);
2435 vm_object_unlock(src_object);
2436
2437 /* Wait for the copy to be ready. */
2438 if (check_ready == TRUE) {
2439 vm_object_lock(copy);
2440 while (!copy->pager_ready) {
2441 vm_object_sleep(copy, VM_OBJECT_EVENT_PAGER_READY, THREAD_UNINT);
2442 }
2443 vm_object_unlock(copy);
2444 }
2445
2446 return KERN_SUCCESS;
2447 }
2448
2449 static int copy_delayed_lock_collisions = 0;
2450 static int copy_delayed_max_collisions = 0;
2451 static int copy_delayed_lock_contention = 0;
2452 static int copy_delayed_protect_iterate = 0;
2453
2454 /*
2455 * Routine: vm_object_copy_delayed [internal]
2456 *
2457 * Description:
2458 * Copy the specified virtual memory object, using
2459 * the asymmetric copy-on-write algorithm.
2460 *
2461 * In/out conditions:
2462 * The src_object must be locked on entry. It will be unlocked
2463 * on exit - so the caller must also hold a reference to it.
2464 *
2465 * This routine will not block waiting for user-generated
2466 * events. It is not interruptible.
2467 */
2468 __private_extern__ vm_object_t
2469 vm_object_copy_delayed(
2470 vm_object_t src_object,
2471 vm_object_offset_t src_offset,
2472 vm_object_size_t size,
2473 boolean_t src_object_shared)
2474 {
2475 vm_object_t new_copy = VM_OBJECT_NULL;
2476 vm_object_t old_copy;
2477 vm_page_t p;
2478 vm_object_size_t copy_size = src_offset + size;
2479
2480
2481 int collisions = 0;
2482 /*
2483 * The user-level memory manager wants to see all of the changes
2484 * to this object, but it has promised not to make any changes on
2485 * its own.
2486 *
2487 * Perform an asymmetric copy-on-write, as follows:
2488 * Create a new object, called a "copy object" to hold
2489 * pages modified by the new mapping (i.e., the copy,
2490 * not the original mapping).
2491 * Record the original object as the backing object for
2492 * the copy object. If the original mapping does not
2493 * change a page, it may be used read-only by the copy.
2494 * Record the copy object in the original object.
2495 * When the original mapping causes a page to be modified,
2496 * it must be copied to a new page that is "pushed" to
2497 * the copy object.
2498 * Mark the new mapping (the copy object) copy-on-write.
2499 * This makes the copy object itself read-only, allowing
2500 * it to be reused if the original mapping makes no
2501 * changes, and simplifying the synchronization required
2502 * in the "push" operation described above.
2503 *
2504 * The copy-on-write is said to be asymmetric because the original
2505 * object is *not* marked copy-on-write. A copied page is pushed
2506 * to the copy object, regardless which party attempted to modify
2507 * the page.
2508 *
2509 * Repeated asymmetric copy operations may be done. If the
2510 * original object has not been changed since the last copy, its
2511 * copy object can be reused. Otherwise, a new copy object can be
2512 * inserted between the original object and its previous copy
2513 * object. Since any copy object is read-only, this cannot
2514 * affect the contents of the previous copy object.
2515 *
2516 * Note that a copy object is higher in the object tree than the
2517 * original object; therefore, use of the copy object recorded in
2518 * the original object must be done carefully, to avoid deadlock.
2519 */
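/*
 * A sketch of the resulting chain, assuming the original object was
 * modified between two delayed copies (so a new copy object is
 * inserted rather than the old one being reused):
 *
 *	old_copy --shadow--> new_copy --shadow--> src_object
 *	                                          (src_object->copy == new_copy)
 *
 * Unmodified pages are found by following the shadow links toward
 * src_object; pages modified after a copy are pushed from src_object
 * into whichever copy object was current at the time.
 */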
2520
2521 Retry:
2522
2523 /*
2524 * Wait for paging in progress.
2525 */
2526 if (!src_object->true_share && src_object->paging_in_progress) {
2527 if (src_object_shared == TRUE) {
2528 vm_object_unlock(src_object);
2529
2530 vm_object_lock(src_object);
2531 src_object_shared = FALSE;
2532 }
2533 vm_object_paging_wait(src_object, THREAD_UNINT);
2534 }
2535 /*
2536 * See whether we can reuse the result of a previous
2537 * copy operation.
2538 */
2539
2540 old_copy = src_object->copy;
2541 if (old_copy != VM_OBJECT_NULL) {
2542 int lock_granted;
2543
2544 /*
2545 * Try to get the locks (out of order)
2546 */
2547 if (src_object_shared == TRUE)
2548 lock_granted = vm_object_lock_try_shared(old_copy);
2549 else
2550 lock_granted = vm_object_lock_try(old_copy);
2551
2552 if (!lock_granted) {
2553 vm_object_unlock(src_object);
2554
2555 if (collisions++ == 0)
2556 copy_delayed_lock_contention++;
2557 mutex_pause(collisions);
2558
2559 /* Heisenberg Rules */
2560 copy_delayed_lock_collisions++;
2561
2562 if (collisions > copy_delayed_max_collisions)
2563 copy_delayed_max_collisions = collisions;
2564
2565 if (src_object_shared == TRUE)
2566 vm_object_lock_shared(src_object);
2567 else
2568 vm_object_lock(src_object);
2569
2570 goto Retry;
2571 }
2572
2573 /*
2574 * Determine whether the old copy object has
2575 * been modified.
2576 */
2577
2578 if (old_copy->resident_page_count == 0 &&
2579 !old_copy->pager_created) {
2580 /*
2581 * It has not been modified.
2582 *
2583 * Return another reference to
2584 * the existing copy-object if
2585 * we can safely grow it (if
2586 * needed).
2587 */
2588
2589 if (old_copy->size < copy_size) {
2590 if (src_object_shared == TRUE) {
2591 vm_object_unlock(old_copy);
2592 vm_object_unlock(src_object);
2593
2594 vm_object_lock(src_object);
2595 src_object_shared = FALSE;
2596 goto Retry;
2597 }
2598 /*
2599 * We can't perform a delayed copy if any of the
2600 * pages in the extended range are wired (because
2601 * we can't safely take write permission away from
2602 * wired pages). If the pages aren't wired, then
2603 * go ahead and protect them.
2604 */
2605 copy_delayed_protect_iterate++;
2606
2607 queue_iterate(&src_object->memq, p, vm_page_t, listq) {
2608 if (!p->fictitious &&
2609 p->offset >= old_copy->size &&
2610 p->offset < copy_size) {
2611 if (p->wire_count > 0) {
2612 vm_object_unlock(old_copy);
2613 vm_object_unlock(src_object);
2614
2615 if (new_copy != VM_OBJECT_NULL) {
2616 vm_object_unlock(new_copy);
2617 vm_object_deallocate(new_copy);
2618 }
2619
2620 return VM_OBJECT_NULL;
2621 } else {
2622 pmap_page_protect(p->phys_page,
2623 (VM_PROT_ALL & ~VM_PROT_WRITE));
2624 }
2625 }
2626 }
2627 old_copy->size = copy_size;
2628 }
2629 if (src_object_shared == TRUE)
2630 vm_object_reference_shared(old_copy);
2631 else
2632 vm_object_reference_locked(old_copy);
2633 vm_object_unlock(old_copy);
2634 vm_object_unlock(src_object);
2635
2636 if (new_copy != VM_OBJECT_NULL) {
2637 vm_object_unlock(new_copy);
2638 vm_object_deallocate(new_copy);
2639 }
2640 return(old_copy);
2641 }
2642
2643
2644
2645 /*
2646 * Adjust the size argument so that the newly-created
2647 * copy object will be large enough to back either the
2648 * old copy object or the new mapping.
2649 */
2650 if (old_copy->size > copy_size)
2651 copy_size = old_copy->size;
2652
2653 if (new_copy == VM_OBJECT_NULL) {
2654 vm_object_unlock(old_copy);
2655 vm_object_unlock(src_object);
2656 new_copy = vm_object_allocate(copy_size);
2657 vm_object_lock(src_object);
2658 vm_object_lock(new_copy);
2659
2660 src_object_shared = FALSE;
2661 goto Retry;
2662 }
2663 new_copy->size = copy_size;
2664
2665 /*
2666 * The copy-object is always made large enough to
2667 * completely shadow the original object, since
2668 * it may have several users who want to shadow
2669 * the original object at different points.
2670 */
2671
2672 assert((old_copy->shadow == src_object) &&
2673 (old_copy->shadow_offset == (vm_object_offset_t) 0));
2674
2675 } else if (new_copy == VM_OBJECT_NULL) {
2676 vm_object_unlock(src_object);
2677 new_copy = vm_object_allocate(copy_size);
2678 vm_object_lock(src_object);
2679 vm_object_lock(new_copy);
2680
2681 src_object_shared = FALSE;
2682 goto Retry;
2683 }
2684
2685 /*
2686 * We now have the src object locked, and the new copy object
2687 * allocated and locked (and potentially the old copy locked).
2688 * Before we go any further, make sure we can still perform
2689 * a delayed copy, as the situation may have changed.
2690 *
2691 * Specifically, we can't perform a delayed copy if any of the
2692 * pages in the range are wired (because we can't safely take
2693 * write permission away from wired pages). If the pages aren't
2694 * wired, then go ahead and protect them.
2695 */
2696 copy_delayed_protect_iterate++;
2697
2698 queue_iterate(&src_object->memq, p, vm_page_t, listq) {
2699 if (!p->fictitious && p->offset < copy_size) {
2700 if (p->wire_count > 0) {
2701 if (old_copy)
2702 vm_object_unlock(old_copy);
2703 vm_object_unlock(src_object);
2704 vm_object_unlock(new_copy);
2705 vm_object_deallocate(new_copy);
2706 return VM_OBJECT_NULL;
2707 } else {
2708 pmap_page_protect(p->phys_page,
2709 (VM_PROT_ALL & ~VM_PROT_WRITE));
2710 }
2711 }
2712 }
2713 if (old_copy != VM_OBJECT_NULL) {
2714 /*
2715 * Make the old copy-object shadow the new one.
2716 * It will receive no more pages from the original
2717 * object.
2718 */
2719
2720 /* remove ref. from old_copy */
2721 vm_object_lock_assert_exclusive(src_object);
2722 src_object->ref_count--;
2723 assert(src_object->ref_count > 0);
2724 vm_object_lock_assert_exclusive(old_copy);
2725 old_copy->shadow = new_copy;
2726 vm_object_lock_assert_exclusive(new_copy);
2727 assert(new_copy->ref_count > 0);
2728 new_copy->ref_count++; /* for old_copy->shadow ref. */
2729
2730 #if TASK_SWAPPER
2731 if (old_copy->res_count) {
2732 VM_OBJ_RES_INCR(new_copy);
2733 VM_OBJ_RES_DECR(src_object);
2734 }
2735 #endif
2736
2737 vm_object_unlock(old_copy); /* done with old_copy */
2738 }
2739
2740 /*
2741 * Point the new copy at the existing object.
2742 */
2743 vm_object_lock_assert_exclusive(new_copy);
2744 new_copy->shadow = src_object;
2745 new_copy->shadow_offset = 0;
2746 new_copy->shadowed = TRUE; /* caller must set needs_copy */
2747
2748 vm_object_lock_assert_exclusive(src_object);
2749 vm_object_reference_locked(src_object);
2750 src_object->copy = new_copy;
2751 vm_object_unlock(src_object);
2752 vm_object_unlock(new_copy);
2753
2754 XPR(XPR_VM_OBJECT,
2755 "vm_object_copy_delayed: used copy object %X for source %X\n",
2756 (integer_t)new_copy, (integer_t)src_object, 0, 0, 0);
2757
2758 return new_copy;
2759 }
2760
2761 /*
2762 * Routine: vm_object_copy_strategically
2763 *
2764 * Purpose:
2765 * Perform a copy according to the source object's
2766 * declared strategy. This operation may block,
2767 * and may be interrupted.
2768 */
2769 __private_extern__ kern_return_t
2770 vm_object_copy_strategically(
2771 register vm_object_t src_object,
2772 vm_object_offset_t src_offset,
2773 vm_object_size_t size,
2774 vm_object_t *dst_object, /* OUT */
2775 vm_object_offset_t *dst_offset, /* OUT */
2776 boolean_t *dst_needs_copy) /* OUT */
2777 {
2778 boolean_t result;
2779 boolean_t interruptible = THREAD_ABORTSAFE; /* XXX */
2780 boolean_t object_lock_shared = FALSE;
2781 memory_object_copy_strategy_t copy_strategy;
2782
2783 assert(src_object != VM_OBJECT_NULL);
2784
2785 copy_strategy = src_object->copy_strategy;
2786
2787 if (copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
2788 vm_object_lock_shared(src_object);
2789 object_lock_shared = TRUE;
2790 } else
2791 vm_object_lock(src_object);
2792
2793 /*
2794 * The copy strategy is only valid if the memory manager
2795 * is "ready". Internal objects are always ready.
2796 */
2797
2798 while (!src_object->internal && !src_object->pager_ready) {
2799 wait_result_t wait_result;
2800
2801 if (object_lock_shared == TRUE) {
2802 vm_object_unlock(src_object);
2803 vm_object_lock(src_object);
2804 object_lock_shared = FALSE;
2805 continue;
2806 }
2807 wait_result = vm_object_sleep( src_object,
2808 VM_OBJECT_EVENT_PAGER_READY,
2809 interruptible);
2810 if (wait_result != THREAD_AWAKENED) {
2811 vm_object_unlock(src_object);
2812 *dst_object = VM_OBJECT_NULL;
2813 *dst_offset = 0;
2814 *dst_needs_copy = FALSE;
2815 return(MACH_SEND_INTERRUPTED);
2816 }
2817 }
2818
2819 /*
2820 * Use the appropriate copy strategy.
2821 */
2822
2823 switch (copy_strategy) {
2824 case MEMORY_OBJECT_COPY_DELAY:
2825 *dst_object = vm_object_copy_delayed(src_object,
2826 src_offset, size, object_lock_shared);
2827 if (*dst_object != VM_OBJECT_NULL) {
2828 *dst_offset = src_offset;
2829 *dst_needs_copy = TRUE;
2830 result = KERN_SUCCESS;
2831 break;
2832 }
2833 vm_object_lock(src_object);
2834 /* fall thru when delayed copy not allowed */
2835
2836 case MEMORY_OBJECT_COPY_NONE:
2837 result = vm_object_copy_slowly(src_object, src_offset, size,
2838 interruptible, dst_object);
2839 if (result == KERN_SUCCESS) {
2840 *dst_offset = 0;
2841 *dst_needs_copy = FALSE;
2842 }
2843 break;
2844
2845 case MEMORY_OBJECT_COPY_CALL:
2846 result = vm_object_copy_call(src_object, src_offset, size,
2847 dst_object);
2848 if (result == KERN_SUCCESS) {
2849 *dst_offset = src_offset;
2850 *dst_needs_copy = TRUE;
2851 }
2852 break;
2853
2854 case MEMORY_OBJECT_COPY_SYMMETRIC:
2855 XPR(XPR_VM_OBJECT, "v_o_c_strategically obj 0x%x off 0x%x size 0x%x\n",(natural_t)src_object, src_offset, size, 0, 0);
2856 vm_object_unlock(src_object);
2857 result = KERN_MEMORY_RESTART_COPY;
2858 break;
2859
2860 default:
2861 panic("copy_strategically: bad strategy");
2862 result = KERN_INVALID_ARGUMENT;
2863 }
2864 return(result);
2865 }
2866
2867 /*
2868 * vm_object_shadow:
2869 *
2870 * Create a new object which is backed by the
2871 * specified existing object range. The source
2872 * object reference is deallocated.
2873 *
2874 * The new object and offset into that object
2875 * are returned in the source parameters.
2876 */
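/*
 * Illustrative sketch (hypothetical caller, not taken from this file):
 * a map entry marked needs_copy would typically be resolved with
 * something like
 *
 *	vm_object_shadow(&entry_object, &entry_offset,
 *			 entry_end - entry_start);
 *
 * after which entry_object names the new shadow object and
 * entry_offset has been reset to 0.
 */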
2877 boolean_t vm_object_shadow_check = FALSE;
2878
2879 __private_extern__ boolean_t
2880 vm_object_shadow(
2881 vm_object_t *object, /* IN/OUT */
2882 vm_object_offset_t *offset, /* IN/OUT */
2883 vm_object_size_t length)
2884 {
2885 register vm_object_t source;
2886 register vm_object_t result;
2887
2888 source = *object;
2889 #if 0
2890 /*
2891 * XXX FBDP
2892 * This assertion is valid but it gets triggered by Rosetta for example
2893 * due to a combination of vm_remap() that changes a VM object's
2894 * copy_strategy from SYMMETRIC to DELAY and vm_protect(VM_PROT_COPY)
2895 * that then sets "needs_copy" on its map entry. This creates a
2896 * mapping situation that VM should never see and doesn't know how to
2897 * handle.
2898 * It's not clear if this can create any real problem but we should
2899 * look into fixing this, probably by having vm_protect(VM_PROT_COPY)
2900 * do more than just set "needs_copy" to handle the copy-on-write...
2901 * In the meantime, let's disable the assertion.
2902 */
2903 assert(source->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
2904 #endif
2905
2906 /*
2907 * Determine if we really need a shadow.
2908 */
2909
2910 if (vm_object_shadow_check && source->ref_count == 1 &&
2911 (source->shadow == VM_OBJECT_NULL ||
2912 source->shadow->copy == VM_OBJECT_NULL))
2913 {
2914 source->shadowed = FALSE;
2915 return FALSE;
2916 }
2917
2918 /*
2919 * Allocate a new object with the given length
2920 */
2921
2922 if ((result = vm_object_allocate(length)) == VM_OBJECT_NULL)
2923 panic("vm_object_shadow: no object for shadowing");
2924
2925 /*
2926 * The new object shadows the source object, adding
2927 * a reference to it. Our caller changes his reference
2928 * to point to the new object, removing a reference to
2929 * the source object. Net result: no change of reference
2930 * count.
2931 */
2932 result->shadow = source;
2933
2934 /*
2935 * Store the offset into the source object,
2936 * and fix up the offset into the new object.
2937 */
2938
2939 result->shadow_offset = *offset;
2940
2941 /*
2942 * Return the new things
2943 */
2944
2945 *offset = 0;
2946 *object = result;
2947 return TRUE;
2948 }
2949
2950 /*
2951 * The relationship between vm_object structures and
2952 * the memory_object requires careful synchronization.
2953 *
2954 * All associations are created by memory_object_create_named
2955 * for external pagers and vm_object_pager_create for internal
2956 * objects as follows:
2957 *
2958 * pager: the memory_object itself, supplied by
2959 * the user requesting a mapping (or the kernel,
2960 * when initializing internal objects); the
2961 * kernel simulates holding send rights by keeping
2962 * a port reference;
2963 *
2964 * pager_request:
2965 * the memory object control port,
2966 * created by the kernel; the kernel holds
2967 * receive (and ownership) rights to this
2968 * port, but no other references.
2969 *
2970 * When initialization is complete, the "initialized" field
2971 * is asserted. Other mappings using a particular memory object,
2972 * and any references to the vm_object gained through the
2973 * port association must wait for this initialization to occur.
2974 *
2975 * In order to allow the memory manager to set attributes before
2976 * requests (notably virtual copy operations, but also data or
2977 * unlock requests) are made, a "ready" attribute is made available.
2978 * Only the memory manager may affect the value of this attribute.
2979 * Its value does not affect critical kernel functions, such as
2980 * internal object initialization or destruction. [Furthermore,
2981 * memory objects created by the kernel are assumed to be ready
2982 * immediately; the default memory manager need not explicitly
2983 * set the "ready" attribute.]
2984 *
2985 * [Both the "initialized" and "ready" attribute wait conditions
2986 * use the "pager" field as the wait event.]
2987 *
2988 * The port associations can be broken down by any of the
2989 * following routines:
2990 * vm_object_terminate:
2991 * No references to the vm_object remain, and
2992 * the object cannot (or will not) be cached.
2993 * This is the normal case, and is done even
2994 * though one of the other cases has already been
2995 * done.
2996 * memory_object_destroy:
2997 * The memory manager has requested that the
2998 * kernel relinquish references to the memory
2999 * object. [The memory manager may not want to
3000 * destroy the memory object, but may wish to
3001 * refuse or tear down existing memory mappings.]
3002 *
3003 * Each routine that breaks an association must break all of
3004 * them at once. At some later time, that routine must clear
3005 * the pager field and release the memory object references.
3006 * [Furthermore, each routine must cope with the simultaneous
3007 * or previous operations of the others.]
3008 *
3009 * In addition to the lock on the object, the vm_object_cache_lock
3010 * governs the associations. References gained through the
3011 * association require use of the cache lock.
3012 *
3013 * Because the pager field may be cleared spontaneously, it
3014 * cannot be used to determine whether a memory object has
3015 * ever been associated with a particular vm_object. [This
3016 * knowledge is important to the shadow object mechanism.]
3017 * For this reason, an additional "created" attribute is
3018 * provided.
3019 *
3020 * During various paging operations, the pager reference found in the
3021 * vm_object must be valid. To prevent this from being released,
3022 * (other than being removed, i.e., made null), routines may use
3023 * the vm_object_paging_begin/end routines [actually, macros].
3024 * The implementation uses the "paging_in_progress" and "wanted" fields.
3025 * [Operations that alter the validity of the pager values include the
3026 * termination routines and vm_object_collapse.]
3027 */
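/*
 * In code terms, the life cycle described above is roughly:
 *
 *	vm_object_enter() / vm_object_pager_create()
 *		-> "pager_created", then "pager_initialized" (+ wakeup);
 *	"pager_ready" set by the memory manager, or immediately for
 *		internal objects;
 *	vm_object_terminate() or memory_object_destroy()
 *		-> associations broken, pager reference released.
 */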
3028
3029
3030 /*
3031 * Routine: vm_object_enter
3032 * Purpose:
3033 * Find a VM object corresponding to the given
3034 * pager; if no such object exists, create one,
3035 * and initialize the pager.
3036 */
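/*
 * Outline of the lookup below: the pager hash is searched under the
 * object-cache lock; if no entry is found, the lock is dropped to
 * allocate a fresh entry and object and the lookup is retried, so a
 * racing thread may win the insertion and the spare allocations are
 * simply discarded further down.
 */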
3037 vm_object_t
3038 vm_object_enter(
3039 memory_object_t pager,
3040 vm_object_size_t size,
3041 boolean_t internal,
3042 boolean_t init,
3043 boolean_t named)
3044 {
3045 register vm_object_t object;
3046 vm_object_t new_object;
3047 boolean_t must_init;
3048 vm_object_hash_entry_t entry, new_entry;
3049 uint32_t try_failed_count = 0;
3050
3051 if (pager == MEMORY_OBJECT_NULL)
3052 return(vm_object_allocate(size));
3053
3054 new_object = VM_OBJECT_NULL;
3055 new_entry = VM_OBJECT_HASH_ENTRY_NULL;
3056 must_init = init;
3057
3058 /*
3059 * Look for an object associated with this port.
3060 */
3061 Retry:
3062 vm_object_cache_lock();
3063 do {
3064 entry = vm_object_hash_lookup(pager, FALSE);
3065
3066 if (entry == VM_OBJECT_HASH_ENTRY_NULL) {
3067 if (new_object == VM_OBJECT_NULL) {
3068 /*
3069 * We must unlock to create a new object;
3070 * if we do so, we must try the lookup again.
3071 */
3072 vm_object_cache_unlock();
3073 assert(new_entry == VM_OBJECT_HASH_ENTRY_NULL);
3074 new_entry = vm_object_hash_entry_alloc(pager);
3075 new_object = vm_object_allocate(size);
3076 vm_object_cache_lock();
3077 } else {
3078 /*
3079 * Lookup failed twice, and we have something
3080 * to insert; set the object.
3081 */
3082 vm_object_hash_insert(new_entry);
3083 entry = new_entry;
3084 entry->object = new_object;
3085 new_entry = VM_OBJECT_HASH_ENTRY_NULL;
3086 new_object = VM_OBJECT_NULL;
3087 must_init = TRUE;
3088 }
3089 } else if (entry->object == VM_OBJECT_NULL) {
3090 /*
3091 * If a previous object is being terminated,
3092 * we must wait for the termination message
3093 * to be queued (and lookup the entry again).
3094 */
3095 entry->waiting = TRUE;
3096 entry = VM_OBJECT_HASH_ENTRY_NULL;
3097 assert_wait((event_t) pager, THREAD_UNINT);
3098 vm_object_cache_unlock();
3099 thread_block(THREAD_CONTINUE_NULL);
3100 vm_object_cache_lock();
3101 }
3102 } while (entry == VM_OBJECT_HASH_ENTRY_NULL);
3103
3104 object = entry->object;
3105 assert(object != VM_OBJECT_NULL);
3106
3107 if (!must_init) {
3108 if (!vm_object_lock_try(object)) {
3109
3110 vm_object_cache_unlock();
3111
3112 try_failed_count++;
3113 mutex_pause(try_failed_count); /* wait a bit */
3114
3115 goto Retry;
3116 }
3117 assert(!internal || object->internal);
3118 if (named) {
3119 assert(!object->named);
3120 object->named = TRUE;
3121 }
3122 if (object->ref_count == 0) {
3123 XPR(XPR_VM_OBJECT_CACHE,
3124 "vm_object_enter: removing %x from cache, head (%x, %x)\n",
3125 (integer_t)object,
3126 (integer_t)vm_object_cached_list.next,
3127 (integer_t)vm_object_cached_list.prev, 0,0);
3128 queue_remove(&vm_object_cached_list, object,
3129 vm_object_t, cached_list);
3130 vm_object_cached_count--;
3131 }
3132 vm_object_lock_assert_exclusive(object);
3133 object->ref_count++;
3134 vm_object_res_reference(object);
3135 vm_object_unlock(object);
3136
3137 VM_STAT_INCR(hits);
3138 }
3139 assert(object->ref_count > 0);
3140
3141 VM_STAT_INCR(lookups);
3142
3143 vm_object_cache_unlock();
3144
3145 XPR(XPR_VM_OBJECT,
3146 "vm_o_enter: pager 0x%x obj 0x%x must_init %d\n",
3147 (integer_t)pager, (integer_t)object, must_init, 0, 0);
3148
3149 /*
3150 * If we raced to create a vm_object but lost, let's
3151 * throw away ours.
3152 */
3153
3154 if (new_object != VM_OBJECT_NULL)
3155 vm_object_deallocate(new_object);
3156
3157 if (new_entry != VM_OBJECT_HASH_ENTRY_NULL)
3158 vm_object_hash_entry_free(new_entry);
3159
3160 if (must_init) {
3161 memory_object_control_t control;
3162
3163 /*
3164 * Allocate request port.
3165 */
3166
3167 control = memory_object_control_allocate(object);
3168 assert (control != MEMORY_OBJECT_CONTROL_NULL);
3169
3170 vm_object_lock(object);
3171 assert(object != kernel_object);
3172
3173 /*
3174 * Copy the reference we were given.
3175 */
3176
3177 memory_object_reference(pager);
3178 object->pager_created = TRUE;
3179 object->pager = pager;
3180 object->internal = internal;
3181 object->pager_trusted = internal;
3182 if (!internal) {
3183 /* copy strategy invalid until set by memory manager */
3184 object->copy_strategy = MEMORY_OBJECT_COPY_INVALID;
3185 }
3186 object->pager_control = control;
3187 object->pager_ready = FALSE;
3188
3189 vm_object_unlock(object);
3190
3191 /*
3192 * Let the pager know we're using it.
3193 */
3194
3195 (void) memory_object_init(pager,
3196 object->pager_control,
3197 PAGE_SIZE);
3198
3199 vm_object_lock(object);
3200 if (named)
3201 object->named = TRUE;
3202 if (internal) {
3203 object->pager_ready = TRUE;
3204 vm_object_wakeup(object, VM_OBJECT_EVENT_PAGER_READY);
3205 }
3206
3207 object->pager_initialized = TRUE;
3208 vm_object_wakeup(object, VM_OBJECT_EVENT_INITIALIZED);
3209 } else {
3210 vm_object_lock(object);
3211 }
3212
3213 /*
3214 * [At this point, the object must be locked]
3215 */
3216
3217 /*
3218 * Wait for the work above to be done by the first
3219 * thread to map this object.
3220 */
3221
3222 while (!object->pager_initialized) {
3223 vm_object_sleep(object,
3224 VM_OBJECT_EVENT_INITIALIZED,
3225 THREAD_UNINT);
3226 }
3227 vm_object_unlock(object);
3228
3229 XPR(XPR_VM_OBJECT,
3230 "vm_object_enter: vm_object %x, memory_object %x, internal %d\n",
3231 (integer_t)object, (integer_t)object->pager, internal, 0,0);
3232 return(object);
3233 }
3234
3235 /*
3236 * Routine: vm_object_pager_create
3237 * Purpose:
3238 * Create a memory object for an internal object.
3239 * In/out conditions:
3240 * The object is locked on entry and exit;
3241 * it may be unlocked within this call.
3242 * Limitations:
3243 * Only one thread may be performing a
3244 * vm_object_pager_create on an object at
3245 * a time. Presumably, only the pageout
3246 * daemon will be using this routine.
3247 */
3248
3249 void
3250 vm_object_pager_create(
3251 register vm_object_t object)
3252 {
3253 memory_object_t pager;
3254 vm_object_hash_entry_t entry;
3255 #if MACH_PAGEMAP
3256 vm_object_size_t size;
3257 vm_external_map_t map;
3258 #endif /* MACH_PAGEMAP */
3259
3260 XPR(XPR_VM_OBJECT, "vm_object_pager_create, object 0x%X\n",
3261 (integer_t)object, 0,0,0,0);
3262
3263 assert(object != kernel_object);
3264
3265 if (memory_manager_default_check() != KERN_SUCCESS)
3266 return;
3267
3268 /*
3269 * Prevent collapse or termination by holding a paging reference
3270 */
3271
3272 vm_object_paging_begin(object);
3273 if (object->pager_created) {
3274 /*
3275 * Someone else got to it first...
3276 * wait for them to finish initializing the ports
3277 */
3278 while (!object->pager_initialized) {
3279 vm_object_sleep(object,
3280 VM_OBJECT_EVENT_INITIALIZED,
3281 THREAD_UNINT);
3282 }
3283 vm_object_paging_end(object);
3284 return;
3285 }
3286
3287 /*
3288 * Indicate that a memory object has been assigned
3289 * before dropping the lock, to prevent a race.
3290 */
3291
3292 object->pager_created = TRUE;
3293 object->paging_offset = 0;
3294
3295 #if MACH_PAGEMAP
3296 size = object->size;
3297 #endif /* MACH_PAGEMAP */
3298 vm_object_unlock(object);
3299
3300 #if MACH_PAGEMAP
3301 map = vm_external_create(size);
3302 vm_object_lock(object);
3303 assert(object->size == size);
3304 object->existence_map = map;
3305 vm_object_unlock(object);
3306 #endif /* MACH_PAGEMAP */
3307
3308 /*
3309 * Create the [internal] pager, and associate it with this object.
3310 *
3311 * We make the association here so that vm_object_enter()
3312 * can look up the object to complete initializing it. No
3313 * user will ever map this object.
3314 */
3315 {
3316 memory_object_default_t dmm;
3317
3318 /* acquire a reference for the default memory manager */
3319 dmm = memory_manager_default_reference();
3320
3321 assert(object->temporary);
3322
3323 /* create our new memory object */
3324 (void) memory_object_create(dmm, object->size, &pager);
3325
3326 memory_object_default_deallocate(dmm);
3327 }
3328
3329 entry = vm_object_hash_entry_alloc(pager);
3330
3331 vm_object_cache_lock();
3332 vm_object_hash_insert(entry);
3333
3334 entry->object = object;
3335 vm_object_cache_unlock();
3336
3337 /*
3338 * A reference was returned by
3339 * memory_object_create(), and it is
3340 * copied by vm_object_enter().
3341 */
3342
3343 if (vm_object_enter(pager, object->size, TRUE, TRUE, FALSE) != object)
3344 panic("vm_object_pager_create: mismatch");
3345
3346 /*
3347 * Drop the reference we were passed.
3348 */
3349 memory_object_deallocate(pager);
3350
3351 vm_object_lock(object);
3352
3353 /*
3354 * Release the paging reference
3355 */
3356 vm_object_paging_end(object);
3357 }
3358
3359 /*
3360 * Routine: vm_object_remove
3361 * Purpose:
3362 * Eliminate the pager/object association
3363 * for this pager.
3364 * Conditions:
3365 * The object cache must be locked.
3366 */
3367 __private_extern__ void
3368 vm_object_remove(
3369 vm_object_t object)
3370 {
3371 memory_object_t pager;
3372
3373 if ((pager = object->pager) != MEMORY_OBJECT_NULL) {
3374 vm_object_hash_entry_t entry;
3375
3376 entry = vm_object_hash_lookup(pager, FALSE);
3377 if (entry != VM_OBJECT_HASH_ENTRY_NULL)
3378 entry->object = VM_OBJECT_NULL;
3379 }
3380
3381 }
3382
3383 /*
3384 * Global variables for vm_object_collapse():
3385 *
3386 * Counts for normal collapses and bypasses.
3387 * Debugging variables, to watch or disable collapse.
3388 */
3389 static long object_collapses = 0;
3390 static long object_bypasses = 0;
3391
3392 static boolean_t vm_object_collapse_allowed = TRUE;
3393 static boolean_t vm_object_bypass_allowed = TRUE;
3394
3395 #if MACH_PAGEMAP
3396 static int vm_external_discarded;
3397 static int vm_external_collapsed;
3398 #endif
3399
3400 unsigned long vm_object_collapse_encrypted = 0;
3401
3402 /*
3403 * Routine: vm_object_do_collapse
3404 * Purpose:
3405 * Collapse an object with the object backing it.
3406 * Pages in the backing object are moved into the
3407 * parent, and the backing object is deallocated.
3408 * Conditions:
3409 * Both objects and the cache are locked; the page
3410 * queues are unlocked.
3411 *
3412 */
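/*
 * The collapse proceeds in three steps, visible below: resident pages
 * are renamed (or discarded) into the parent, the pager association
 * and hash entry are transferred, and the parent is finally relinked
 * to the backing object's own shadow before the backing object is
 * freed.
 */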
3413 static void
3414 vm_object_do_collapse(
3415 vm_object_t object,
3416 vm_object_t backing_object)
3417 {
3418 vm_page_t p, pp;
3419 vm_object_offset_t new_offset, backing_offset;
3420 vm_object_size_t size;
3421
3422 backing_offset = object->shadow_offset;
3423 size = object->size;
3424
3425 /*
3426 * Move all in-memory pages from backing_object
3427 * to the parent. Pages that have been paged out
3428 * will be overwritten by any of the parent's
3429 * pages that shadow them.
3430 */
3431
3432 while (!queue_empty(&backing_object->memq)) {
3433
3434 p = (vm_page_t) queue_first(&backing_object->memq);
3435
3436 new_offset = (p->offset - backing_offset);
3437
3438 assert(!p->busy || p->absent);
3439
3440 /*
3441 * If the parent has a page here, or if
3442 * this page falls outside the parent,
3443 * dispose of it.
3444 *
3445 * Otherwise, move it as planned.
3446 */
3447
3448 if (p->offset < backing_offset || new_offset >= size) {
3449 VM_PAGE_FREE(p);
3450 } else {
3451 /*
3452 * ENCRYPTED SWAP:
3453 * The encryption key includes the "pager" and the
3454 * "paging_offset". These will not change during the
3455 * object collapse, so we can just move an encrypted
3456 * page from one object to the other in this case.
3457 * We can't decrypt the page here, since we can't drop
3458 * the object lock.
3459 */
3460 if (p->encrypted) {
3461 vm_object_collapse_encrypted++;
3462 }
3463 pp = vm_page_lookup(object, new_offset);
3464 if (pp == VM_PAGE_NULL) {
3465
3466 /*
3467 * Parent now has no page.
3468 * Move the backing object's page up.
3469 */
3470
3471 vm_page_rename(p, object, new_offset, TRUE);
3472 #if MACH_PAGEMAP
3473 } else if (pp->absent) {
3474
3475 /*
3476 * Parent has an absent page...
3477 * it's not being paged in, so
3478 * it must really be missing from
3479 * the parent.
3480 *
3481 * Throw out the absent page...
3482 * any faults looking for that
3483 * page will restart with the new
3484 * one.
3485 */
3486
3487 VM_PAGE_FREE(pp);
3488 vm_page_rename(p, object, new_offset, TRUE);
3489 #endif /* MACH_PAGEMAP */
3490 } else {
3491 assert(! pp->absent);
3492
3493 /*
3494 * Parent object has a real page.
3495 * Throw away the backing object's
3496 * page.
3497 */
3498 VM_PAGE_FREE(p);
3499 }
3500 }
3501 }
3502
3503 #if !MACH_PAGEMAP
3504 assert((!object->pager_created && (object->pager == MEMORY_OBJECT_NULL))
3505 || (!backing_object->pager_created
3506 && (backing_object->pager == MEMORY_OBJECT_NULL)));
3507 #else
3508 assert(!object->pager_created && object->pager == MEMORY_OBJECT_NULL);
3509 #endif /* !MACH_PAGEMAP */
3510
3511 if (backing_object->pager != MEMORY_OBJECT_NULL) {
3512 vm_object_hash_entry_t entry;
3513
3514 /*
3515 * Move the pager from backing_object to object.
3516 *
3517 * XXX We're only using part of the paging space
3518 * for keeps now... we ought to discard the
3519 * unused portion.
3520 */
3521
3522 assert(!object->paging_in_progress);
3523 object->pager = backing_object->pager;
3524 entry = vm_object_hash_lookup(object->pager, FALSE);
3525 assert(entry != VM_OBJECT_HASH_ENTRY_NULL);
3526 entry->object = object;
3527 object->pager_created = backing_object->pager_created;
3528 object->pager_control = backing_object->pager_control;
3529 object->pager_ready = backing_object->pager_ready;
3530 object->pager_initialized = backing_object->pager_initialized;
3531 object->paging_offset =
3532 backing_object->paging_offset + backing_offset;
3533 if (object->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
3534 memory_object_control_collapse(object->pager_control,
3535 object);
3536 }
3537 }
3538
3539 vm_object_cache_unlock();
3540
3541 #if MACH_PAGEMAP
3542 /*
3543 * If the shadow offset is 0, use the existence map from
3544 * the backing object if there is one. If the shadow offset is
3545 * not zero, toss it.
3546 *
3547 * XXX - If the shadow offset is not 0 then a bit copy is needed
3548 * if the map is to be salvaged. For now, we just toss the
3549 * old map, giving the collapsed object no map. This means that
3550 * the pager is invoked for zero fill pages. If analysis shows
3551 * that this happens frequently and is a performance hit, then
3552 * this code should be fixed to salvage the map.
3553 */
3554 assert(object->existence_map == VM_EXTERNAL_NULL);
3555 if (backing_offset || (size != backing_object->size)) {
3556 vm_external_discarded++;
3557 vm_external_destroy(backing_object->existence_map,
3558 backing_object->size);
3559 }
3560 else {
3561 vm_external_collapsed++;
3562 object->existence_map = backing_object->existence_map;
3563 }
3564 backing_object->existence_map = VM_EXTERNAL_NULL;
3565 #endif /* MACH_PAGEMAP */
3566
3567 /*
3568 * Object now shadows whatever backing_object did.
3569 * Note that the reference to backing_object->shadow
3570 * moves from within backing_object to within object.
3571 */
3572
3573 assert(!object->phys_contiguous);
3574 assert(!backing_object->phys_contiguous);
3575 object->shadow = backing_object->shadow;
3576 if (object->shadow) {
3577 object->shadow_offset += backing_object->shadow_offset;
3578 } else {
3579 /* no shadow, therefore no shadow offset... */
3580 object->shadow_offset = 0;
3581 }
3582 assert((object->shadow == VM_OBJECT_NULL) ||
3583 (object->shadow->copy != backing_object));
3584
3585 /*
3586 * Discard backing_object.
3587 *
3588 * Since the backing object has no pages, no
3589 * pager left, and no object references within it,
3590 * all that is necessary is to dispose of it.
3591 */
3592
3593 assert((backing_object->ref_count == 1) &&
3594 (backing_object->resident_page_count == 0) &&
3595 (backing_object->paging_in_progress == 0));
3596
3597 backing_object->alive = FALSE;
3598 vm_object_unlock(backing_object);
3599
3600 XPR(XPR_VM_OBJECT, "vm_object_collapse, collapsed 0x%X\n",
3601 (integer_t)backing_object, 0,0,0,0);
3602
3603 vm_object_lock_destroy(backing_object);
3604
3605 zfree(vm_object_zone, backing_object);
3606
3607 object_collapses++;
3608 }
3609
3610 static void
3611 vm_object_do_bypass(
3612 vm_object_t object,
3613 vm_object_t backing_object)
3614 {
3615 /*
3616 * Make the parent shadow the next object
3617 * in the chain.
3618 */
3619
3620 vm_object_lock_assert_exclusive(backing_object);
3621
3622 #if TASK_SWAPPER
3623 /*
3624 * Do object reference in-line to
3625 * conditionally increment shadow's
3626 * residence count. If object is not
3627 * resident, leave residence count
3628 * on shadow alone.
3629 */
3630 if (backing_object->shadow != VM_OBJECT_NULL) {
3631 vm_object_lock(backing_object->shadow);
3632 vm_object_lock_assert_exclusive(backing_object->shadow);
3633 backing_object->shadow->ref_count++;
3634 if (object->res_count != 0)
3635 vm_object_res_reference(backing_object->shadow);
3636 vm_object_unlock(backing_object->shadow);
3637 }
3638 #else /* TASK_SWAPPER */
3639 vm_object_reference(backing_object->shadow);
3640 #endif /* TASK_SWAPPER */
3641
3642 assert(!object->phys_contiguous);
3643 assert(!backing_object->phys_contiguous);
3644 object->shadow = backing_object->shadow;
3645 if (object->shadow) {
3646 object->shadow_offset += backing_object->shadow_offset;
3647 } else {
3648 /* no shadow, therefore no shadow offset... */
3649 object->shadow_offset = 0;
3650 }
3651
3652 /*
3653 * Backing object might have had a copy pointer
3654 * to us. If it did, clear it.
3655 */
3656 if (backing_object->copy == object) {
3657 backing_object->copy = VM_OBJECT_NULL;
3658 }
3659
3660 /*
3661 * Drop the reference count on backing_object.
3662 #if TASK_SWAPPER
3663 * Since its ref_count was at least 2, it
3664 * will not vanish; so we don't need to call
3665 * vm_object_deallocate.
3666 * [FBDP: that doesn't seem to be true any more]
3667 *
3668 * The res_count on the backing object is
3669 * conditionally decremented. It's possible
3670 * (via vm_pageout_scan) to get here with
3671 * a "swapped" object, which has a 0 res_count,
3672 * in which case, the backing object res_count
3673 * is already down by one.
3674 #else
3675 * Don't call vm_object_deallocate unless
3676 * ref_count drops to zero.
3677 *
3678 * The ref_count can drop to zero here if the
3679 * backing object could be bypassed but not
3680 * collapsed, such as when the backing object
3681 * is temporary and cachable.
3682 #endif
3683 */
3684 if (backing_object->ref_count > 1) {
3685 vm_object_lock_assert_exclusive(backing_object);
3686 backing_object->ref_count--;
3687 #if TASK_SWAPPER
3688 if (object->res_count != 0)
3689 vm_object_res_deallocate(backing_object);
3690 assert(backing_object->ref_count > 0);
3691 #endif /* TASK_SWAPPER */
3692 vm_object_unlock(backing_object);
3693 } else {
3694
3695 /*
3696 * Drop locks so that we can deallocate
3697 * the backing object.
3698 */
3699
3700 #if TASK_SWAPPER
3701 if (object->res_count == 0) {
3702 /* XXX get a reference for the deallocate below */
3703 vm_object_res_reference(backing_object);
3704 }
3705 #endif /* TASK_SWAPPER */
3706 vm_object_unlock(object);
3707 vm_object_unlock(backing_object);
3708 vm_object_deallocate(backing_object);
3709
3710 /*
3711 * Relock object. We don't have to reverify
3712 * its state since vm_object_collapse will
3713 * do that for us as it starts at the
3714 * top of its loop.
3715 */
3716
3717 vm_object_lock(object);
3718 }
3719
3720 object_bypasses++;
3721 }
3722
3723
3724 /*
3725 * vm_object_collapse:
3726 *
3727 * Perform an object collapse or an object bypass if appropriate.
3728 * The real work of collapsing and bypassing is performed in
3729 * the routines vm_object_do_collapse and vm_object_do_bypass.
3730 *
3731 * Requires that the object be locked and the page queues be unlocked.
3732 *
3733 */
3734 static unsigned long vm_object_collapse_calls = 0;
3735 static unsigned long vm_object_collapse_objects = 0;
3736 static unsigned long vm_object_collapse_do_collapse = 0;
3737 static unsigned long vm_object_collapse_do_bypass = 0;
3738 static unsigned long vm_object_collapse_delays = 0;
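/*
 * Roughly: a backing object referenced only by this parent (subject to
 * the pager checks below) is collapsed into it; otherwise, if bypass
 * is permitted and none of the backing object's pages can "show
 * through", the parent is simply re-pointed at the backing object's
 * own shadow.
 */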
3739 __private_extern__ void
3740 vm_object_collapse(
3741 register vm_object_t object,
3742 register vm_object_offset_t hint_offset,
3743 boolean_t can_bypass)
3744 {
3745 register vm_object_t backing_object;
3746 register unsigned int rcount;
3747 register unsigned int size;
3748 vm_object_t original_object;
3749
3750 vm_object_collapse_calls++;
3751
3752 if (! vm_object_collapse_allowed &&
3753 ! (can_bypass && vm_object_bypass_allowed)) {
3754 return;
3755 }
3756
3757 XPR(XPR_VM_OBJECT, "vm_object_collapse, obj 0x%X\n",
3758 (integer_t)object, 0,0,0,0);
3759
3760 if (object == VM_OBJECT_NULL)
3761 return;
3762
3763 original_object = object;
3764
3765 while (TRUE) {
3766 vm_object_collapse_objects++;
3767 /*
3768 * Verify that the conditions are right for either
3769 * collapse or bypass:
3770 */
3771
3772 /*
3773 * There is a backing object, and
3774 */
3775
3776 backing_object = object->shadow;
3777 if (backing_object == VM_OBJECT_NULL) {
3778 if (object != original_object) {
3779 vm_object_unlock(object);
3780 }
3781 return;
3782 }
3783
3784 /*
3785 * No pages in the object are currently
3786 * being paged out, and
3787 */
3788 if (object->paging_in_progress != 0) {
3789 /* try and collapse the rest of the shadow chain */
3790 vm_object_lock(backing_object);
3791 if (object != original_object) {
3792 vm_object_unlock(object);
3793 }
3794 object = backing_object;
3795 continue;
3796 }
3797
3798 vm_object_lock(backing_object);
3799
3800 /*
3801 * ...
3802 * The backing object is not read_only,
3803 * and no pages in the backing object are
3804 * currently being paged out.
3805 * The backing object is internal.
3806 *
3807 */
3808
3809 if (!backing_object->internal ||
3810 backing_object->paging_in_progress != 0) {
3811 /* try and collapse the rest of the shadow chain */
3812 if (object != original_object) {
3813 vm_object_unlock(object);
3814 }
3815 object = backing_object;
3816 continue;
3817 }
3818
3819 /*
3820 * The backing object can't be a copy-object:
3821 * the shadow_offset for the copy-object must stay
3822 * as 0. Furthermore (for the 'we have all the
3823 * pages' case), if we bypass backing_object and
3824 * just shadow the next object in the chain, old
3825 * pages from that object would then have to be copied
3826 * BOTH into the (former) backing_object and into the
3827 * parent object.
3828 */
3829 if (backing_object->shadow != VM_OBJECT_NULL &&
3830 backing_object->shadow->copy == backing_object) {
3831 /* try and collapse the rest of the shadow chain */
3832 if (object != original_object) {
3833 vm_object_unlock(object);
3834 }
3835 object = backing_object;
3836 continue;
3837 }
3838
3839 /*
3840 * We can now try to either collapse the backing
3841 * object (if the parent is the only reference to
3842 * it) or (perhaps) remove the parent's reference
3843 * to it.
3844 *
3845 * If there is exactly one reference to the backing
3846 * object, we may be able to collapse it into the
3847 * parent.
3848 *
3849 * If MACH_PAGEMAP is defined:
3850 * The parent must not have a pager created for it,
3851 * since collapsing a backing_object dumps new pages
3852 * into the parent that its pager doesn't know about
3853 * (and the collapse code can't merge the existence
3854 * maps).
3855 * Otherwise:
3856 * As long as one of the objects is still not known
3857 * to the pager, we can collapse them.
3858 */
3859 if (backing_object->ref_count == 1 &&
3860 (!object->pager_created
3861 #if !MACH_PAGEMAP
3862 || !backing_object->pager_created
3863 #endif /*!MACH_PAGEMAP */
3864 ) && vm_object_collapse_allowed) {
3865
3866 XPR(XPR_VM_OBJECT,
3867 "vm_object_collapse: %x to %x, pager %x, pager_control %x\n",
3868 (integer_t)backing_object, (integer_t)object,
3869 (integer_t)backing_object->pager,
3870 (integer_t)backing_object->pager_control, 0);
3871
3872 /*
3873 * We need the cache lock for collapsing,
3874 * but we must not deadlock.
3875 */
3876
3877 if (! vm_object_cache_lock_try()) {
3878 if (object != original_object) {
3879 vm_object_unlock(object);
3880 }
3881 vm_object_unlock(backing_object);
3882 return;
3883 }
3884
3885 /*
3886 * Collapse the object with its backing
3887 * object, and try again with the object's
3888 * new backing object.
3889 */
3890
3891 vm_object_do_collapse(object, backing_object);
3892 vm_object_collapse_do_collapse++;
3893 continue;
3894 }
3895
3896 /*
3897 * Collapsing the backing object was not possible
3898 * or permitted, so let's try bypassing it.
3899 */
3900
3901 if (! (can_bypass && vm_object_bypass_allowed)) {
3902 /* try and collapse the rest of the shadow chain */
3903 if (object != original_object) {
3904 vm_object_unlock(object);
3905 }
3906 object = backing_object;
3907 continue;
3908 }
3909
3910
3911 /*
3912 * If the object doesn't have all its pages present,
3913 * we have to make sure no pages in the backing object
3914 * "show through" before bypassing it.
3915 */
3916 size = atop(object->size);
3917 rcount = object->resident_page_count;
3918 if (rcount != size) {
3919 vm_object_offset_t offset;
3920 vm_object_offset_t backing_offset;
3921 unsigned int backing_rcount;
3922 unsigned int lookups = 0;
3923
3924 /*
3925 * If the backing object has a pager but no pagemap,
3926 * then we cannot bypass it, because we don't know
3927 * what pages it has.
3928 */
3929 if (backing_object->pager_created
3930 #if MACH_PAGEMAP
3931 && (backing_object->existence_map == VM_EXTERNAL_NULL)
3932 #endif /* MACH_PAGEMAP */
3933 ) {
3934 /* try and collapse the rest of the shadow chain */
3935 if (object != original_object) {
3936 vm_object_unlock(object);
3937 }
3938 object = backing_object;
3939 continue;
3940 }
3941
3942 /*
3943 * If the object has a pager but no pagemap,
3944 * then we cannot bypass it, because we don't know
3945 * what pages it has.
3946 */
3947 if (object->pager_created
3948 #if MACH_PAGEMAP
3949 && (object->existence_map == VM_EXTERNAL_NULL)
3950 #endif /* MACH_PAGEMAP */
3951 ) {
3952 /* try and collapse the rest of the shadow chain */
3953 if (object != original_object) {
3954 vm_object_unlock(object);
3955 }
3956 object = backing_object;
3957 continue;
3958 }
3959
3960 /*
3961 * If all of the pages in the backing object are
3962 * shadowed by the parent object, the parent
3963 * object no longer has to shadow the backing
3964 * object; it can shadow the next one in the
3965 * chain.
3966 *
3967 * If the backing object has existence info,
3968 * we must also examine its existence info
3969 * as well.
3970 *
3971 */
3972
3973 backing_offset = object->shadow_offset;
3974 backing_rcount = backing_object->resident_page_count;
3975
3976 #if MACH_PAGEMAP
3977 #define EXISTS_IN_OBJECT(obj, off, rc) \
3978 (vm_external_state_get((obj)->existence_map, \
3979 (vm_offset_t)(off)) == VM_EXTERNAL_STATE_EXISTS || \
3980 ((rc) && ++lookups && vm_page_lookup((obj), (off)) != VM_PAGE_NULL && (rc)--))
3981 #else
3982 #define EXISTS_IN_OBJECT(obj, off, rc) \
3983 (((rc) && ++lookups && vm_page_lookup((obj), (off)) != VM_PAGE_NULL && (rc)--))
3984 #endif /* MACH_PAGEMAP */
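/*
 * Editor's note on the macro above (not part of the original source):
 * EXISTS_IN_OBJECT(obj, off, rc) evaluates TRUE if a page at offset
 * "off" is known to exist in "obj" -- either via the existence map
 * (when MACH_PAGEMAP is configured) or via an actual vm_page_lookup().
 * The lookup path bumps "lookups" (used below to throttle the scan
 * with mutex_pause) and consumes one unit of the residual count "rc".
 */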
3985
3986 /*
3987 * Check the hint location first
3988 * (since it is often the quickest way out of here).
3989 */
3990 if (object->cow_hint != ~(vm_offset_t)0)
3991 hint_offset = (vm_object_offset_t)object->cow_hint;
3992 else
3993 hint_offset = (hint_offset > 8 * PAGE_SIZE_64) ?
3994 (hint_offset - 8 * PAGE_SIZE_64) : 0;
3995
3996 if (EXISTS_IN_OBJECT(backing_object, hint_offset +
3997 backing_offset, backing_rcount) &&
3998 !EXISTS_IN_OBJECT(object, hint_offset, rcount)) {
3999 /* dependency right at the hint */
4000 object->cow_hint = (vm_offset_t)hint_offset;
4001 /* try and collapse the rest of the shadow chain */
4002 if (object != original_object) {
4003 vm_object_unlock(object);
4004 }
4005 object = backing_object;
4006 continue;
4007 }
4008
4009 /*
4010 * If the object's window onto the backing_object
4011 * is large compared to the number of resident
4012 * pages in the backing object, it makes sense to
4013 * walk the backing_object's resident pages first.
4014 *
4015 * NOTE: Pages may be in both the existence map and
4016 * resident. So, we can't permanently decrement
4017 * the rcount here because the second loop may
4018 * find the same pages in the backing object's
4019 * existence map that we found here, and we would
4020 * then double-decrement the rcount.
4022 */
4023 if (backing_rcount &&
4024 #if MACH_PAGEMAP
4025 size > ((backing_object->existence_map) ?
4026 backing_rcount : (backing_rcount >> 1))
4027 #else
4028 size > (backing_rcount >> 1)
4029 #endif /* MACH_PAGEMAP */
4030 ) {
4031 unsigned int rc = rcount;
4032 vm_page_t p;
4033
4034 backing_rcount = backing_object->resident_page_count;
4035 p = (vm_page_t)queue_first(&backing_object->memq);
4036 do {
4037 /* Until we get more than one lookup lock */
4038 if (lookups > 256) {
4039 vm_object_collapse_delays++;
4040 lookups = 0;
4041 mutex_pause(0);
4042 }
4043
4044 offset = (p->offset - backing_offset);
4045 if (offset < object->size &&
4046 offset != hint_offset &&
4047 !EXISTS_IN_OBJECT(object, offset, rc)) {
4048 /* found a dependency */
4049 object->cow_hint = (vm_offset_t)offset;
4050 break;
4051 }
4052 p = (vm_page_t) queue_next(&p->listq);
4053
4054 } while (--backing_rcount);
4055 if (backing_rcount != 0 ) {
4056 /* try and collapse the rest of the shadow chain */
4057 if (object != original_object) {
4058 vm_object_unlock(object);
4059 }
4060 object = backing_object;
4061 continue;
4062 }
4063 }
4064
4065 /*
4066 * Walk through the offsets looking for pages in the
4067 * backing object that show through to the object.
4068 */
4069 #if MACH_PAGEMAP
4070 if (backing_rcount || backing_object->existence_map) {
4071 #else
4072 if (backing_rcount) {
4073 #endif /* MACH_PAGEMAP */
4074 offset = hint_offset;
4075
4076 while((offset =
4077 (offset + PAGE_SIZE_64 < object->size) ?
4078 (offset + PAGE_SIZE_64) : 0) != hint_offset) {
4079
4080 /* Until we get more than one lookup lock */
4081 if (lookups > 256) {
4082 vm_object_collapse_delays++;
4083 lookups = 0;
4084 mutex_pause(0);
4085 }
4086
4087 if (EXISTS_IN_OBJECT(backing_object, offset +
4088 backing_offset, backing_rcount) &&
4089 !EXISTS_IN_OBJECT(object, offset, rcount)) {
4090 /* found a dependency */
4091 object->cow_hint = (vm_offset_t)offset;
4092 break;
4093 }
4094 }
4095 if (offset != hint_offset) {
4096 /* try and collapse the rest of the shadow chain */
4097 if (object != original_object) {
4098 vm_object_unlock(object);
4099 }
4100 object = backing_object;
4101 continue;
4102 }
4103 }
4104 }
4105
4106 /* reset the offset hint for any objects deeper in the chain */
4107 object->cow_hint = (vm_offset_t)0;
4108
4109 /*
4110 * All interesting pages in the backing object
4111 * already live in the parent or its pager.
4112 * Thus we can bypass the backing object.
4113 */
4114
4115 vm_object_do_bypass(object, backing_object);
4116 vm_object_collapse_do_bypass++;
4117
4118 /*
4119 * Try again with this object's new backing object.
4120 */
4121
4122 continue;
4123 }
4124
4125 if (object != original_object) {
4126 vm_object_unlock(object);
4127 }
4128 }
4129
4130 /*
4131 * Routine: vm_object_page_remove: [internal]
4132 * Purpose:
4133 * Removes all physical pages in the specified
4134 * object range from the object's list of pages.
4135 *
4136 * In/out conditions:
4137 * The object must be locked.
4138 * The object must not have paging_in_progress, usually
4139 * guaranteed by not having a pager.
4140 */
4141 unsigned int vm_object_page_remove_lookup = 0;
4142 unsigned int vm_object_page_remove_iterate = 0;
4143
4144 __private_extern__ void
4145 vm_object_page_remove(
4146 register vm_object_t object,
4147 register vm_object_offset_t start,
4148 register vm_object_offset_t end)
4149 {
4150 register vm_page_t p, next;
4151
4152 /*
4153 * One and two page removals are most popular.
4154 * The factor of 16 here is somewhat arbitrary.
4155 * It balances vm_page_lookup vs. iteration.
4156 */
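/*
 * Worked example (editor's sketch): removing 4 pages from an object
 * with 128 resident pages takes the lookup path (4 < 128/16 == 8),
 * while removing 16 pages from the same object walks the whole memq.
 */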
4157
4158 if (atop_64(end - start) < (unsigned)object->resident_page_count/16) {
4159 vm_object_page_remove_lookup++;
4160
4161 for (; start < end; start += PAGE_SIZE_64) {
4162 p = vm_page_lookup(object, start);
4163 if (p != VM_PAGE_NULL) {
4164 assert(!p->cleaning && !p->pageout);
4165 if (!p->fictitious && p->pmapped)
4166 pmap_disconnect(p->phys_page);
4167 VM_PAGE_FREE(p);
4168 }
4169 }
4170 } else {
4171 vm_object_page_remove_iterate++;
4172
4173 p = (vm_page_t) queue_first(&object->memq);
4174 while (!queue_end(&object->memq, (queue_entry_t) p)) {
4175 next = (vm_page_t) queue_next(&p->listq);
4176 if ((start <= p->offset) && (p->offset < end)) {
4177 assert(!p->cleaning && !p->pageout);
4178 if (!p->fictitious && p->pmapped)
4179 pmap_disconnect(p->phys_page);
4180 VM_PAGE_FREE(p);
4181 }
4182 p = next;
4183 }
4184 }
4185 }
4186
4187
4188 /*
4189 * Routine: vm_object_coalesce
4190 * Function: Coalesces two objects backing up adjoining
4191 * regions of memory into a single object.
4192 *
4193 * returns TRUE if objects were combined.
4194 *
4195 * NOTE: Only works at the moment if the second object is NULL -
4196 * if it's not, which object do we lock first?
4197 *
4198 * Parameters:
4199 * prev_object First object to coalesce
4200 * prev_offset Offset into prev_object
4201 * next_object Second object to coalesce
4202 * next_offset Offset into next_object
4203 *
4204 * prev_size Size of reference to prev_object
4205 * next_size Size of reference to next_object
4206 *
4207 * Conditions:
4208 * The object(s) must *not* be locked. The map must be locked
4209 * to preserve the reference to the object(s).
4210 */
4211 static int vm_object_coalesce_count = 0;
4212
4213 __private_extern__ boolean_t
4214 vm_object_coalesce(
4215 register vm_object_t prev_object,
4216 vm_object_t next_object,
4217 vm_object_offset_t prev_offset,
4218 __unused vm_object_offset_t next_offset,
4219 vm_object_size_t prev_size,
4220 vm_object_size_t next_size)
4221 {
4222 vm_object_size_t newsize;
4223
4224 #ifdef lint
4225 next_offset++;
4226 #endif /* lint */
4227
4228 if (next_object != VM_OBJECT_NULL) {
4229 return(FALSE);
4230 }
4231
4232 if (prev_object == VM_OBJECT_NULL) {
4233 return(TRUE);
4234 }
4235
4236 XPR(XPR_VM_OBJECT,
4237 "vm_object_coalesce: 0x%X prev_off 0x%X prev_size 0x%X next_size 0x%X\n",
4238 (integer_t)prev_object, prev_offset, prev_size, next_size, 0);
4239
4240 vm_object_lock(prev_object);
4241
4242 /*
4243 * Try to collapse the object first
4244 */
4245 vm_object_collapse(prev_object, prev_offset, TRUE);
4246
4247 /*
4248 * Can't coalesce if pages not mapped to
4249 * prev_entry may be in use in any way:
4250 * . more than one reference
4251 * . paged out
4252 * . shadows another object
4253 * . has a copy elsewhere
4254 * . is purgeable
4255 * . paging references (pages might be in page-list)
4256 */
4257
4258 if ((prev_object->ref_count > 1) ||
4259 prev_object->pager_created ||
4260 (prev_object->shadow != VM_OBJECT_NULL) ||
4261 (prev_object->copy != VM_OBJECT_NULL) ||
4262 (prev_object->true_share != FALSE) ||
4263 (prev_object->purgable != VM_PURGABLE_DENY) ||
4264 (prev_object->paging_in_progress != 0)) {
4265 vm_object_unlock(prev_object);
4266 return(FALSE);
4267 }
4268
4269 vm_object_coalesce_count++;
4270
4271 /*
4272 * Remove any pages that may still be in the object from
4273 * a previous deallocation.
4274 */
4275 vm_object_page_remove(prev_object,
4276 prev_offset + prev_size,
4277 prev_offset + prev_size + next_size);
4278
4279 /*
4280 * Extend the object if necessary.
4281 */
4282 newsize = prev_offset + prev_size + next_size;
4283 if (newsize > prev_object->size) {
4284 #if MACH_PAGEMAP
4285 /*
4286 * We cannot extend an object that has existence info,
4287 * since the existence info might then fail to cover
4288 * the entire object.
4289 *
4290 * This assertion must be true because the object
4291 * has no pager, and we only create existence info
4292 * for objects with pagers.
4293 */
4294 assert(prev_object->existence_map == VM_EXTERNAL_NULL);
4295 #endif /* MACH_PAGEMAP */
4296 prev_object->size = newsize;
4297 }
4298
4299 vm_object_unlock(prev_object);
4300 return(TRUE);
4301 }
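
/*
 * Usage sketch (editor's addition, not part of the original source):
 * a caller extending an existing anonymous mapping -- vm_map_enter()
 * style -- might try to grow the previous entry's object in place.
 * The prev_* and extension_size names below are hypothetical locals
 * of such a caller.
 */
#if 0	/* example only */
	if (vm_object_coalesce(prev_object,		/* first object */
			       VM_OBJECT_NULL,		/* second object must be NULL */
			       prev_offset,		/* offset into prev_object */
			       (vm_object_offset_t) 0,	/* next_offset (unused) */
			       prev_size,		/* size already mapped */
			       extension_size)) {	/* size being added */
		/* success: simply extend the previous map entry */
	} else {
		/* failure: allocate a fresh object for the new range */
	}
#endif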
4302
4303 /*
4304 * Attach a set of physical pages to an object, so that they can
4305 * be mapped by mapping the object. Typically used to map IO memory.
4306 *
4307 * The mapping function and its private data are used to obtain the
4308 * physical addresses for each page to be mapped.
4309 */
4310 void
4311 vm_object_page_map(
4312 vm_object_t object,
4313 vm_object_offset_t offset,
4314 vm_object_size_t size,
4315 vm_object_offset_t (*map_fn)(void *map_fn_data,
4316 vm_object_offset_t offset),
4317 void *map_fn_data) /* private to map_fn */
4318 {
4319 int num_pages;
4320 int i;
4321 vm_page_t m;
4322 vm_page_t old_page;
4323 vm_object_offset_t addr;
4324
4325 num_pages = atop_64(size);
4326
4327 for (i = 0; i < num_pages; i++, offset += PAGE_SIZE_64) {
4328
4329 addr = (*map_fn)(map_fn_data, offset);
4330
4331 while ((m = vm_page_grab_fictitious()) == VM_PAGE_NULL)
4332 vm_page_more_fictitious();
4333
4334 vm_object_lock(object);
4335 if ((old_page = vm_page_lookup(object, offset))
4336 != VM_PAGE_NULL)
4337 {
4338 vm_page_lock_queues();
4339 vm_page_free(old_page);
4340 vm_page_unlock_queues();
4341 }
4342
4343 vm_page_init(m, addr);
4344 /* private normally requires lock_queues but since we */
4345 /* are initializing the page, it's not necessary here */
4346 m->private = TRUE; /* don't free page */
4347 m->wire_count = 1;
4348 vm_page_insert(m, object, offset);
4349
4350 PAGE_WAKEUP_DONE(m);
4351 vm_object_unlock(object);
4352 }
4353 }
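
/*
 * Usage sketch (editor's addition, not part of the original source):
 * a minimal map_fn for a physically contiguous I/O region.  The
 * "io_base" and "region_size" values are hypothetical; the value
 * returned here is what vm_object_page_map() hands to vm_page_init()
 * for each page.
 */
#if 0	/* example only */
static vm_object_offset_t
example_io_map_fn(void *map_fn_data, vm_object_offset_t offset)
{
	vm_object_offset_t io_base = *(vm_object_offset_t *) map_fn_data;

	/* the backing location for each page is just base + offset */
	return io_base + offset;
}

	/* ... then, with io_base and region_size set up: */
	vm_object_page_map(object, 0, region_size,
			   example_io_map_fn, (void *) &io_base);
#endif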
4354
4355 #include <mach_kdb.h>
4356
4357 #if MACH_KDB
4358 #include <ddb/db_output.h>
4359 #include <vm/vm_print.h>
4360
4361 #define printf kdbprintf
4362
4363 extern boolean_t vm_object_cached(
4364 vm_object_t object);
4365
4366 extern void print_bitstring(
4367 char byte);
4368
4369 boolean_t vm_object_print_pages = FALSE;
4370
4371 void
4372 print_bitstring(
4373 char byte)
4374 {
4375 printf("%c%c%c%c%c%c%c%c",
4376 ((byte & (1 << 0)) ? '1' : '0'),
4377 ((byte & (1 << 1)) ? '1' : '0'),
4378 ((byte & (1 << 2)) ? '1' : '0'),
4379 ((byte & (1 << 3)) ? '1' : '0'),
4380 ((byte & (1 << 4)) ? '1' : '0'),
4381 ((byte & (1 << 5)) ? '1' : '0'),
4382 ((byte & (1 << 6)) ? '1' : '0'),
4383 ((byte & (1 << 7)) ? '1' : '0'));
4384 }
4385
4386 boolean_t
4387 vm_object_cached(
4388 register vm_object_t object)
4389 {
4390 register vm_object_t o;
4391
4392 queue_iterate(&vm_object_cached_list, o, vm_object_t, cached_list) {
4393 if (object == o) {
4394 return TRUE;
4395 }
4396 }
4397 return FALSE;
4398 }
4399
4400 #if MACH_PAGEMAP
4401 /*
4402 * vm_external_print: [ debug ]
4403 */
4404 void
4405 vm_external_print(
4406 vm_external_map_t emap,
4407 vm_size_t size)
4408 {
4409 if (emap == VM_EXTERNAL_NULL) {
4410 printf("0 ");
4411 } else {
4412 vm_size_t existence_size = stob(size);
4413 printf("{ size=%d, map=[", existence_size);
4414 if (existence_size > 0) {
4415 print_bitstring(emap[0]);
4416 }
4417 if (existence_size > 1) {
4418 print_bitstring(emap[1]);
4419 }
4420 if (existence_size > 2) {
4421 printf("...");
4422 print_bitstring(emap[existence_size-1]);
4423 }
4424 printf("] }\n");
4425 }
4426 return;
4427 }
4428 #endif /* MACH_PAGEMAP */
4429
4430 int
4431 vm_follow_object(
4432 vm_object_t object)
4433 {
4434 int count = 0;
4435 int orig_db_indent = db_indent;
4436
4437 while (TRUE) {
4438 if (object == VM_OBJECT_NULL) {
4439 db_indent = orig_db_indent;
4440 return count;
4441 }
4442
4443 count += 1;
4444
4445 iprintf("object 0x%x", object);
4446 printf(", shadow=0x%x", object->shadow);
4447 printf(", copy=0x%x", object->copy);
4448 printf(", pager=0x%x", object->pager);
4449 printf(", ref=%d\n", object->ref_count);
4450
4451 db_indent += 2;
4452 object = object->shadow;
4453 }
4454
4455 }
4456
4457 /*
4458 * vm_object_print: [ debug ]
4459 */
4460 void
4461 vm_object_print(db_expr_t db_addr, __unused boolean_t have_addr,
4462 __unused db_expr_t arg_count, __unused char *modif)
4463 {
4464 vm_object_t object;
4465 register vm_page_t p;
4466 const char *s;
4467
4468 register int count;
4469
4470 object = (vm_object_t) (long) db_addr;
4471 if (object == VM_OBJECT_NULL)
4472 return;
4473
4474 iprintf("object 0x%x\n", object);
4475
4476 db_indent += 2;
4477
4478 iprintf("size=0x%x", object->size);
4479 printf(", memq_hint=%p", object->memq_hint);
4480 printf(", ref_count=%d\n", object->ref_count);
4481 iprintf("");
4482 #if TASK_SWAPPER
4483 printf("res_count=%d, ", object->res_count);
4484 #endif /* TASK_SWAPPER */
4485 printf("resident_page_count=%d\n", object->resident_page_count);
4486
4487 iprintf("shadow=0x%x", object->shadow);
4488 if (object->shadow) {
4489 register int i = 0;
4490 vm_object_t shadow = object;
4491 while((shadow = shadow->shadow))
4492 i++;
4493 printf(" (depth %d)", i);
4494 }
4495 printf(", copy=0x%x", object->copy);
4496 printf(", shadow_offset=0x%x", object->shadow_offset);
4497 printf(", last_alloc=0x%x\n", object->last_alloc);
4498
4499 iprintf("pager=0x%x", object->pager);
4500 printf(", paging_offset=0x%x", object->paging_offset);
4501 printf(", pager_control=0x%x\n", object->pager_control);
4502
4503 iprintf("copy_strategy=%d[", object->copy_strategy);
4504 switch (object->copy_strategy) {
4505 case MEMORY_OBJECT_COPY_NONE:
4506 printf("copy_none");
4507 break;
4508
4509 case MEMORY_OBJECT_COPY_CALL:
4510 printf("copy_call");
4511 break;
4512
4513 case MEMORY_OBJECT_COPY_DELAY:
4514 printf("copy_delay");
4515 break;
4516
4517 case MEMORY_OBJECT_COPY_SYMMETRIC:
4518 printf("copy_symmetric");
4519 break;
4520
4521 case MEMORY_OBJECT_COPY_INVALID:
4522 printf("copy_invalid");
4523 break;
4524
4525 default:
4526 printf("?");
4527 }
4528 printf("]");
4529
4530 iprintf("all_wanted=0x%x<", object->all_wanted);
4531 s = "";
4532 if (vm_object_wanted(object, VM_OBJECT_EVENT_INITIALIZED)) {
4533 printf("%sinit", s);
4534 s = ",";
4535 }
4536 if (vm_object_wanted(object, VM_OBJECT_EVENT_PAGER_READY)) {
4537 printf("%sready", s);
4538 s = ",";
4539 }
4540 if (vm_object_wanted(object, VM_OBJECT_EVENT_PAGING_IN_PROGRESS)) {
4541 printf("%spaging", s);
4542 s = ",";
4543 }
4544 if (vm_object_wanted(object, VM_OBJECT_EVENT_LOCK_IN_PROGRESS)) {
4545 printf("%slock", s);
4546 s = ",";
4547 }
4548 if (vm_object_wanted(object, VM_OBJECT_EVENT_UNCACHING)) {
4549 printf("%suncaching", s);
4550 s = ",";
4551 }
4552 if (vm_object_wanted(object, VM_OBJECT_EVENT_COPY_CALL)) {
4553 printf("%scopy_call", s);
4554 s = ",";
4555 }
4556 if (vm_object_wanted(object, VM_OBJECT_EVENT_CACHING)) {
4557 printf("%scaching", s);
4558 s = ",";
4559 }
4560 printf(">");
4561 printf(", paging_in_progress=%d\n", object->paging_in_progress);
4562
4563 iprintf("%screated, %sinit, %sready, %spersist, %strusted, %spageout, %s, %s\n",
4564 (object->pager_created ? "" : "!"),
4565 (object->pager_initialized ? "" : "!"),
4566 (object->pager_ready ? "" : "!"),
4567 (object->can_persist ? "" : "!"),
4568 (object->pager_trusted ? "" : "!"),
4569 (object->pageout ? "" : "!"),
4570 (object->internal ? "internal" : "external"),
4571 (object->temporary ? "temporary" : "permanent"));
4572 iprintf("%salive, %spurgeable, %spurgeable_volatile, %spurgeable_empty, %sshadowed, %scached, %sprivate\n",
4573 (object->alive ? "" : "!"),
4574 ((object->purgable != VM_PURGABLE_DENY) ? "" : "!"),
4575 ((object->purgable == VM_PURGABLE_VOLATILE) ? "" : "!"),
4576 ((object->purgable == VM_PURGABLE_EMPTY) ? "" : "!"),
4577 (object->shadowed ? "" : "!"),
4578 (vm_object_cached(object) ? "" : "!"),
4579 (object->private ? "" : "!"));
4580 iprintf("%sadvisory_pageout, %ssilent_overwrite\n",
4581 (object->advisory_pageout ? "" : "!"),
4582 (object->silent_overwrite ? "" : "!"));
4583
4584 #if MACH_PAGEMAP
4585 iprintf("existence_map=");
4586 vm_external_print(object->existence_map, object->size);
4587 #endif /* MACH_PAGEMAP */
4588 #if MACH_ASSERT
4589 iprintf("paging_object=0x%x\n", object->paging_object);
4590 #endif /* MACH_ASSERT */
4591
4592 if (vm_object_print_pages) {
4593 count = 0;
4594 p = (vm_page_t) queue_first(&object->memq);
4595 while (!queue_end(&object->memq, (queue_entry_t) p)) {
4596 if (count == 0) {
4597 iprintf("memory:=");
4598 } else if (count == 2) {
4599 printf("\n");
4600 iprintf(" ...");
4601 count = 0;
4602 } else {
4603 printf(",");
4604 }
4605 count++;
4606
4607 printf("(off=0x%llX,page=%p)", p->offset, p);
4608 p = (vm_page_t) queue_next(&p->listq);
4609 }
4610 if (count != 0) {
4611 printf("\n");
4612 }
4613 }
4614 db_indent -= 2;
4615 }
4616
4617
4618 /*
4619 * vm_object_find [ debug ]
4620 *
4621 * Find all tasks which reference the given vm_object.
4622 */
4623
4624 boolean_t vm_object_find(vm_object_t object);
4625 boolean_t vm_object_print_verbose = FALSE;
4626
4627 boolean_t
4628 vm_object_find(
4629 vm_object_t object)
4630 {
4631 task_t task;
4632 vm_map_t map;
4633 vm_map_entry_t entry;
4634 boolean_t found = FALSE;
4635
4636 queue_iterate(&tasks, task, task_t, tasks) {
4637 map = task->map;
4638 for (entry = vm_map_first_entry(map);
4639 entry && entry != vm_map_to_entry(map);
4640 entry = entry->vme_next) {
4641
4642 vm_object_t obj;
4643
4644 /*
4645 * For the time being skip submaps,
4646 * only the kernel can have submaps,
4647 * and unless we are interested in
4648 * kernel objects, we can simply skip
4649 * submaps. See sb/dejan/nmk18b7/src/mach_kernel/vm
4650 * for a full solution.
4651 */
4652 if (entry->is_sub_map)
4653 continue;
4654 if (entry)
4655 obj = entry->object.vm_object;
4656 else
4657 continue;
4658
4659 while (obj != VM_OBJECT_NULL) {
4660 if (obj == object) {
4661 if (!found) {
4662 printf("TASK\t\tMAP\t\tENTRY\n");
4663 found = TRUE;
4664 }
4665 printf("0x%x\t0x%x\t0x%x\n",
4666 task, map, entry);
4667 }
4668 obj = obj->shadow;
4669 }
4670 }
4671 }
4672
4673 return(found);
4674 }
4675
4676 #endif /* MACH_KDB */
4677
4678 kern_return_t
4679 vm_object_populate_with_private(
4680 vm_object_t object,
4681 vm_object_offset_t offset,
4682 ppnum_t phys_page,
4683 vm_size_t size)
4684 {
4685 ppnum_t base_page;
4686 vm_object_offset_t base_offset;
4687
4688
4689 if(!object->private)
4690 return KERN_FAILURE;
4691
4692 base_page = phys_page;
4693
4694 vm_object_lock(object);
4695 if(!object->phys_contiguous) {
4696 vm_page_t m;
4697 if((base_offset = trunc_page_64(offset)) != offset) {
4698 vm_object_unlock(object);
4699 return KERN_FAILURE;
4700 }
4701 base_offset += object->paging_offset;
4702 while(size) {
4703 m = vm_page_lookup(object, base_offset);
4704 if(m != VM_PAGE_NULL) {
4705 if(m->fictitious) {
4706 if (m->phys_page !=
4707 vm_page_guard_addr) {
4708 vm_page_lockspin_queues();
4709 m->fictitious = FALSE;
4710 m->private = TRUE;
4711 m->phys_page = base_page;
4712 if(!m->busy) {
4713 m->busy = TRUE;
4714 }
4715 if(!m->absent) {
4716 m->absent = TRUE;
4717 }
4718 m->list_req_pending = TRUE;
4719 vm_page_unlock_queues();
4720 }
4721 } else if (m->phys_page != base_page) {
4722 if (m->pmapped) {
4723 /*
4724 * pmap call to clear old mapping
4725 */
4726 pmap_disconnect(m->phys_page);
4727 }
4728 m->phys_page = base_page;
4729 }
4730
4731 /*
4732 * ENCRYPTED SWAP:
4733 * We're not pointing to the same
4734 * physical page any longer and the
4735 * contents of the new one are not
4736 * supposed to be encrypted.
4737 * XXX What happens to the original
4738 * physical page. Is it lost ?
4739 */
4740 m->encrypted = FALSE;
4741
4742 } else {
4743 while ((m = vm_page_grab_fictitious())
4744 == VM_PAGE_NULL)
4745 vm_page_more_fictitious();
4746 vm_page_lockspin_queues();
4747 m->fictitious = FALSE;
4748 m->private = TRUE;
4749 m->phys_page = base_page;
4750 m->list_req_pending = TRUE;
4751 m->absent = TRUE;
4752 m->unusual = TRUE;
4753 vm_page_unlock_queues();
4754 vm_page_insert(m, object, base_offset);
4755 }
4756 base_page++; /* Go to the next physical page */
4757 base_offset += PAGE_SIZE;
4758 size -= PAGE_SIZE;
4759 }
4760 } else {
4761 /* NOTE: we should check the original settings here */
4762 /* if we have a size > zero a pmap call should be made */
4763 /* to disable the range */
4764
4765 /* pmap_? */
4766
4767 /* shadows on contiguous memory are not allowed */
4768 /* we therefore can use the offset field */
4769 object->shadow_offset = (vm_object_offset_t)(phys_page << 12);
4770 object->size = size;
4771 }
4772 vm_object_unlock(object);
4773 return KERN_SUCCESS;
4774 }
4775
4776 /*
4777 * memory_object_free_from_cache:
4778 *
4779 * Walk the vm_object cache list, removing and freeing vm_objects
4780 * which are backed by the pager identified by the caller, (pager_ops).
4781 * Remove up to "count" objects, if that many are available
4782 * in the cache.
4783 *
4784 * Walk the list at most once, return the number of vm_objects
4785 * actually freed.
4786 */
4787
4788 __private_extern__ kern_return_t
4789 memory_object_free_from_cache(
4790 __unused host_t host,
4791 memory_object_pager_ops_t pager_ops,
4792 int *count)
4793 {
4794
4795 int object_released = 0;
4796
4797 register vm_object_t object = VM_OBJECT_NULL;
4798 vm_object_t shadow;
4799
4800 /*
4801 if(host == HOST_NULL)
4802 return(KERN_INVALID_ARGUMENT);
4803 */
4804
4805 try_again:
4806 vm_object_cache_lock();
4807
4808 queue_iterate(&vm_object_cached_list, object,
4809 vm_object_t, cached_list) {
4810 if (object->pager &&
4811 (pager_ops == object->pager->mo_pager_ops)) {
4812 vm_object_lock(object);
4813 queue_remove(&vm_object_cached_list, object,
4814 vm_object_t, cached_list);
4815 vm_object_cached_count--;
4816
4817 /*
4818 * Since this object is in the cache, we know
4819 * that it is initialized and has only a pager's
4820 * (implicit) reference. Take a reference to avoid
4821 * recursive deallocations.
4822 */
4823
4824 assert(object->pager_initialized);
4825 assert(object->ref_count == 0);
4826 vm_object_lock_assert_exclusive(object);
4827 object->ref_count++;
4828
4829 /*
4830 * Terminate the object.
4831 * If the object had a shadow, we let
4832 * vm_object_deallocate deallocate it.
4833 * "pageout" objects have a shadow, but
4834 * maintain a "paging reference" rather
4835 * than a normal reference.
4836 * (We are careful here to limit recursion.)
4837 */
4838 shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
4839 if ((vm_object_terminate(object) == KERN_SUCCESS)
4840 && (shadow != VM_OBJECT_NULL)) {
4841 vm_object_deallocate(shadow);
4842 }
4843
4844 if(object_released++ == *count)
4845 return KERN_SUCCESS;
4846 goto try_again;
4847 }
4848 }
4849 vm_object_cache_unlock();
4850 *count = object_released;
4851 return KERN_SUCCESS;
4852 }
4853
4854
4855
4856 kern_return_t
4857 memory_object_create_named(
4858 memory_object_t pager,
4859 memory_object_offset_t size,
4860 memory_object_control_t *control)
4861 {
4862 vm_object_t object;
4863 vm_object_hash_entry_t entry;
4864
4865 *control = MEMORY_OBJECT_CONTROL_NULL;
4866 if (pager == MEMORY_OBJECT_NULL)
4867 return KERN_INVALID_ARGUMENT;
4868
4869 vm_object_cache_lock();
4870 entry = vm_object_hash_lookup(pager, FALSE);
4871 if ((entry != VM_OBJECT_HASH_ENTRY_NULL) &&
4872 (entry->object != VM_OBJECT_NULL)) {
4873 if (entry->object->named == TRUE)
4874 panic("memory_object_create_named: caller already holds the right"); }
4875
4876 vm_object_cache_unlock();
4877 if ((object = vm_object_enter(pager, size, FALSE, FALSE, TRUE))
4878 == VM_OBJECT_NULL) {
4879 return(KERN_INVALID_OBJECT);
4880 }
4881
4882 /* wait for object (if any) to be ready */
4883 if (object != VM_OBJECT_NULL) {
4884 vm_object_lock(object);
4885 object->named = TRUE;
4886 while (!object->pager_ready) {
4887 vm_object_sleep(object,
4888 VM_OBJECT_EVENT_PAGER_READY,
4889 THREAD_UNINT);
4890 }
4891 *control = object->pager_control;
4892 vm_object_unlock(object);
4893 }
4894 return (KERN_SUCCESS);
4895 }
4896
4897
4898 /*
4899 * Routine: memory_object_recover_named [user interface]
4900 * Purpose:
4901 * Attempt to recover a named reference for a VM object.
4902 * VM will verify that the object has not already started
4903 * down the termination path, and if it has, will optionally
4904 * wait for that to finish.
4905 * Returns:
4906 * KERN_SUCCESS - we recovered a named reference on the object
4907 * KERN_FAILURE - we could not recover a reference (object dead)
4908 * KERN_INVALID_ARGUMENT - bad memory object control
4909 */
4910 kern_return_t
4911 memory_object_recover_named(
4912 memory_object_control_t control,
4913 boolean_t wait_on_terminating)
4914 {
4915 vm_object_t object;
4916
4917 vm_object_cache_lock();
4918 object = memory_object_control_to_vm_object(control);
4919 if (object == VM_OBJECT_NULL) {
4920 vm_object_cache_unlock();
4921 return (KERN_INVALID_ARGUMENT);
4922 }
4923
4924 restart:
4925 vm_object_lock(object);
4926
4927 if (object->terminating && wait_on_terminating) {
4928 vm_object_cache_unlock();
4929 vm_object_wait(object,
4930 VM_OBJECT_EVENT_PAGING_IN_PROGRESS,
4931 THREAD_UNINT);
4932 vm_object_cache_lock();
4933 goto restart;
4934 }
4935
4936 if (!object->alive) {
4937 vm_object_cache_unlock();
4938 vm_object_unlock(object);
4939 return KERN_FAILURE;
4940 }
4941
4942 if (object->named == TRUE) {
4943 vm_object_cache_unlock();
4944 vm_object_unlock(object);
4945 return KERN_SUCCESS;
4946 }
4947
4948 if((object->ref_count == 0) && (!object->terminating)){
4949 queue_remove(&vm_object_cached_list, object,
4950 vm_object_t, cached_list);
4951 vm_object_cached_count--;
4952 XPR(XPR_VM_OBJECT_CACHE,
4953 "memory_object_recover_named: removing %X, head (%X, %X)\n",
4954 (integer_t)object,
4955 (integer_t)vm_object_cached_list.next,
4956 (integer_t)vm_object_cached_list.prev, 0,0);
4957 }
4958
4959 vm_object_cache_unlock();
4960
4961 object->named = TRUE;
4962 vm_object_lock_assert_exclusive(object);
4963 object->ref_count++;
4964 vm_object_res_reference(object);
4965 while (!object->pager_ready) {
4966 vm_object_sleep(object,
4967 VM_OBJECT_EVENT_PAGER_READY,
4968 THREAD_UNINT);
4969 }
4970 vm_object_unlock(object);
4971 return (KERN_SUCCESS);
4972 }
4973
4974
4975 /*
4976 * vm_object_release_name:
4977 *
4978 * Enforces name semantic on memory_object reference count decrement
4979 * This routine should not be called unless the caller holds a name
4980 * reference gained through the memory_object_create_named.
4981 *
4982 * If the TERMINATE_IDLE flag is set, the call will return if the
4983 * reference count is not 1. i.e. idle with the only remaining reference
4984 * being the name.
4985 * If the decision is made to proceed the name field flag is set to
4986 * false and the reference count is decremented. If the RESPECT_CACHE
4987 * flag is set and the reference count has gone to zero, the
4988 * memory_object is checked to see if it is cacheable otherwise when
4989 * the reference count is zero, it is simply terminated.
4990 */
4991
4992 __private_extern__ kern_return_t
4993 vm_object_release_name(
4994 vm_object_t object,
4995 int flags)
4996 {
4997 vm_object_t shadow;
4998 boolean_t original_object = TRUE;
4999
5000 while (object != VM_OBJECT_NULL) {
5001
5002 /*
5003 * The cache holds a reference (uncounted) to
5004 * the object. We must lock it before removing
5005 * the object.
5006 *
5007 */
5008
5009 vm_object_cache_lock();
5010 vm_object_lock(object);
5011 assert(object->alive);
5012 if(original_object)
5013 assert(object->named);
5014 assert(object->ref_count > 0);
5015
5016 /*
5017 * We have to wait for initialization before
5018 * destroying or caching the object.
5019 */
5020
5021 if (object->pager_created && !object->pager_initialized) {
5022 assert(!object->can_persist);
5023 vm_object_assert_wait(object,
5024 VM_OBJECT_EVENT_INITIALIZED,
5025 THREAD_UNINT);
5026 vm_object_unlock(object);
5027 vm_object_cache_unlock();
5028 thread_block(THREAD_CONTINUE_NULL);
5029 continue;
5030 }
5031
5032 if (((object->ref_count > 1)
5033 && (flags & MEMORY_OBJECT_TERMINATE_IDLE))
5034 || (object->terminating)) {
5035 vm_object_unlock(object);
5036 vm_object_cache_unlock();
5037 return KERN_FAILURE;
5038 } else {
5039 if (flags & MEMORY_OBJECT_RELEASE_NO_OP) {
5040 vm_object_unlock(object);
5041 vm_object_cache_unlock();
5042 return KERN_SUCCESS;
5043 }
5044 }
5045
5046 if ((flags & MEMORY_OBJECT_RESPECT_CACHE) &&
5047 (object->ref_count == 1)) {
5048 if(original_object)
5049 object->named = FALSE;
5050 vm_object_unlock(object);
5051 vm_object_cache_unlock();
5052 /* let vm_object_deallocate push this thing into */
5053 /* the cache, if that is where it is bound */
5054 vm_object_deallocate(object);
5055 return KERN_SUCCESS;
5056 }
5057 VM_OBJ_RES_DECR(object);
5058 shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
5059 if(object->ref_count == 1) {
5060 if(vm_object_terminate(object) != KERN_SUCCESS) {
5061 if(original_object) {
5062 return KERN_FAILURE;
5063 } else {
5064 return KERN_SUCCESS;
5065 }
5066 }
5067 if (shadow != VM_OBJECT_NULL) {
5068 original_object = FALSE;
5069 object = shadow;
5070 continue;
5071 }
5072 return KERN_SUCCESS;
5073 } else {
5074 vm_object_lock_assert_exclusive(object);
5075 object->ref_count--;
5076 assert(object->ref_count > 0);
5077 if(original_object)
5078 object->named = FALSE;
5079 vm_object_unlock(object);
5080 vm_object_cache_unlock();
5081 return KERN_SUCCESS;
5082 }
5083 }
5084 /*NOTREACHED*/
5085 assert(0);
5086 return KERN_FAILURE;
5087 }
5088
5089
5090 __private_extern__ kern_return_t
5091 vm_object_lock_request(
5092 vm_object_t object,
5093 vm_object_offset_t offset,
5094 vm_object_size_t size,
5095 memory_object_return_t should_return,
5096 int flags,
5097 vm_prot_t prot)
5098 {
5099 __unused boolean_t should_flush;
5100
5101 should_flush = flags & MEMORY_OBJECT_DATA_FLUSH;
5102
5103 XPR(XPR_MEMORY_OBJECT,
5104 "vm_o_lock_request, obj 0x%X off 0x%X size 0x%X flags %X prot %X\n",
5105 (integer_t)object, offset, size,
5106 (((should_return&1)<<1)|should_flush), prot);
5107
5108 /*
5109 * Check for bogus arguments.
5110 */
5111 if (object == VM_OBJECT_NULL)
5112 return (KERN_INVALID_ARGUMENT);
5113
5114 if ((prot & ~VM_PROT_ALL) != 0 && prot != VM_PROT_NO_CHANGE)
5115 return (KERN_INVALID_ARGUMENT);
5116
5117 size = round_page_64(size);
5118
5119 /*
5120 * Lock the object, and acquire a paging reference to
5121 * prevent the memory_object reference from being released.
5122 */
5123 vm_object_lock(object);
5124 vm_object_paging_begin(object);
5125
5126 (void)vm_object_update(object,
5127 offset, size, NULL, NULL, should_return, flags, prot);
5128
5129 vm_object_paging_end(object);
5130 vm_object_unlock(object);
5131
5132 return (KERN_SUCCESS);
5133 }
5134
5135 /*
5136 * Empty a purgeable object by grabbing the physical pages assigned to it and
5137 * putting them on the free queue without writing them to backing store, etc.
5138 * When the pages are next touched they will be demand zero-fill pages. We
5139 * skip pages which are busy, being paged in/out, wired, etc. We do _not_
5140 * skip referenced/dirty pages, pages on the active queue, etc. We're more
5141 * than happy to grab these since this is a purgeable object. We mark the
5142 * object as "empty" after reaping its pages.
5143 *
5144 * On entry the object and page queues are locked, the object must be a
5145 * purgeable object with no delayed copies pending.
5146 */
5147 unsigned int
5148 vm_object_purge(vm_object_t object)
5149 {
5150 vm_page_t p, next;
5151 unsigned int num_purged_pages;
5152 vm_page_t local_freeq;
5153 unsigned long local_freed;
5154 int purge_loop_quota;
5155 /* free pages as soon as we gather PURGE_BATCH_FREE_LIMIT pages to free */
5156 #define PURGE_BATCH_FREE_LIMIT 50
5157 /* release page queues lock every PURGE_LOOP_QUOTA iterations */
5158 #define PURGE_LOOP_QUOTA 100
5159
5160 num_purged_pages = 0;
5161 if (object->purgable == VM_PURGABLE_DENY)
5162 return num_purged_pages;
5163
5164 assert(object->purgable != VM_PURGABLE_NONVOLATILE);
5165 object->purgable = VM_PURGABLE_EMPTY;
5166
5167 assert(object->copy == VM_OBJECT_NULL);
5168 assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE);
5169 purge_loop_quota = PURGE_LOOP_QUOTA;
5170
5171 local_freeq = VM_PAGE_NULL;
5172 local_freed = 0;
5173
5174 /*
5175 * Go through the object's resident pages and try and discard them.
5176 */
5177 next = (vm_page_t)queue_first(&object->memq);
5178 while (!queue_end(&object->memq, (queue_entry_t)next)) {
5179 p = next;
5180 next = (vm_page_t)queue_next(&next->listq);
5181
5182 if (purge_loop_quota-- == 0) {
5183 /*
5184 * Avoid holding the page queues lock for too long.
5185 * Let someone else take it for a while if needed.
5186 * Keep holding the object's lock to guarantee that
5187 * the object's page list doesn't change under us
5188 * while we yield.
5189 */
5190 if (local_freeq != VM_PAGE_NULL) {
5191 /*
5192 * Flush our queue of pages to free.
5193 */
5194 vm_page_free_list(local_freeq);
5195 local_freeq = VM_PAGE_NULL;
5196 local_freed = 0;
5197 }
5198 mutex_yield(&vm_page_queue_lock);
5199
5200 /* resume with the current page and a new quota */
5201 purge_loop_quota = PURGE_LOOP_QUOTA;
5202 }
5203
5204
5205 if (p->busy || p->cleaning || p->laundry ||
5206 p->list_req_pending) {
5207 /* page is being acted upon, so don't mess with it */
5208 continue;
5209 }
5210 if (p->wire_count) {
5211 /* don't discard a wired page */
5212 continue;
5213 }
5214
5215 assert(!p->laundry);
5216 assert(p->object != kernel_object);
5217
5218 /* we can discard this page */
5219
5220 /* advertise that this page is in a transition state */
5221 p->busy = TRUE;
5222
5223 if (p->pmapped == TRUE) {
5224 /* unmap the page */
5225 int refmod_state;
5226
5227 refmod_state = pmap_disconnect(p->phys_page);
5228 if (refmod_state & VM_MEM_MODIFIED) {
5229 p->dirty = TRUE;
5230 }
5231 }
5232
5233 if (p->dirty || p->precious) {
5234 /* we saved the cost of cleaning this page ! */
5235 num_purged_pages++;
5236 vm_page_purged_count++;
5237 }
5238
5239 vm_page_free_prepare(p);
5240
5241 /* ... and put it on our queue of pages to free */
5242 assert(p->pageq.next == NULL &&
5243 p->pageq.prev == NULL);
5244 p->pageq.next = (queue_entry_t) local_freeq;
5245 local_freeq = p;
5246 if (++local_freed >= PURGE_BATCH_FREE_LIMIT) {
5247 /* flush our queue of pages to free */
5248 vm_page_free_list(local_freeq);
5249 local_freeq = VM_PAGE_NULL;
5250 local_freed = 0;
5251 }
5252 }
5253
5254 /* flush our local queue of pages to free one last time */
5255 if (local_freeq != VM_PAGE_NULL) {
5256 vm_page_free_list(local_freeq);
5257 local_freeq = VM_PAGE_NULL;
5258 local_freed = 0;
5259 }
5260
5261 return num_purged_pages;
5262 }
5263
5264 /*
5265 * vm_object_purgeable_control() allows the caller to control and investigate the
5266 * state of a purgeable object. A purgeable object is created via a call to
5267 * vm_allocate() with VM_FLAGS_PURGABLE specified. A purgeable object will
5268 * never be coalesced with any other object -- even other purgeable objects --
5269 * and will thus always remain a distinct object. A purgeable object has
5270 * special semantics when its reference count is exactly 1. If its reference
5271 * count is greater than 1, then a purgeable object will behave like a normal
5272 * object and attempts to use this interface will result in an error return
5273 * of KERN_INVALID_ARGUMENT.
5274 *
5275 * A purgeable object may be put into a "volatile" state which will make the
5276 * object's pages eligible to be reclaimed without paging to backing
5277 * store if the system runs low on memory. If the pages in a volatile
5278 * purgeable object are reclaimed, the purgeable object is said to have been
5279 * "emptied." When a purgeable object is emptied the system will reclaim as
5280 * many pages from the object as it can in a convenient manner (pages already
5281 * en route to backing store or busy for other reasons are left as is). When
5282 * a purgeable object is made volatile, its pages will generally be reclaimed
5283 * before other pages in the application's working set. This semantic is
5284 * generally used by applications which can recreate the data in the object
5285 * faster than it can be paged in. One such example might be media assets
5286 * which can be reread from a much faster RAID volume.
5287 *
5288 * A purgeable object may be designated as "non-volatile" which means it will
5289 * behave like all other objects in the system with pages being written to and
5290 * read from backing store as needed to satisfy system memory needs. If the
5291 * object was emptied before the object was made non-volatile, that fact will
5292 * be returned as the old state of the purgeable object (see
5293 * VM_PURGABLE_SET_STATE below). In this case, any pages of the object which
5294 * were reclaimed as part of emptying the object will be refaulted in as
5295 * zero-fill on demand. It is up to the application to note that an object
5296 * was emptied and recreate the object's contents if necessary. When a
5297 * purgeable object is made non-volatile, its pages will generally not be paged
5298 * out to backing store in the immediate future. A purgeable object may also
5299 * be manually emptied.
5300 *
5301 * Finally, the current state (non-volatile, volatile, volatile & empty) of a
5302 * volatile purgeable object may be queried at any time. This information may
5303 * be used as a control input to let the application know when the system is
5304 * experiencing memory pressure and is reclaiming memory.
5305 *
5306 * The specified address may be any address within the purgeable object. If
5307 * the specified address does not represent any object in the target task's
5308 * virtual address space, then KERN_INVALID_ADDRESS will be returned. If the
5309 * object containing the specified address is not a purgeable object, then
5310 * KERN_INVALID_ARGUMENT will be returned. Otherwise, KERN_SUCCESS will be
5311 * returned.
5312 *
5313 * The control parameter may be any one of VM_PURGABLE_SET_STATE or
5314 * VM_PURGABLE_GET_STATE. For VM_PURGABLE_SET_STATE, the in/out parameter
5315 * state is used to set the new state of the purgeable object and return its
5316 * old state. For VM_PURGABLE_GET_STATE, the current state of the purgeable
5317 * object is returned in the parameter state.
5318 *
5319 * The in/out parameter state may be one of VM_PURGABLE_NONVOLATILE,
5320 * VM_PURGABLE_VOLATILE or VM_PURGABLE_EMPTY. These, respectively, represent
5321 * the non-volatile, volatile and volatile/empty states described above.
5322 * Setting the state of a purgeable object to VM_PURGABLE_EMPTY will
5323 * immediately reclaim as many pages in the object as can be conveniently
5324 * collected (some may have already been written to backing store or be
5325 * otherwise busy).
5326 *
5327 * The process of making a purgeable object non-volatile and determining its
5328 * previous state is atomic. Thus, if a purgeable object is made
5329 * VM_PURGABLE_NONVOLATILE and the old state is returned as
5330 * VM_PURGABLE_VOLATILE, then the purgeable object's previous contents are
5331 * completely intact and will remain so until the object is made volatile
5332 * again. If the old state is returned as VM_PURGABLE_EMPTY then the object
5333 * was reclaimed while it was in a volatile state and its previous contents
5334 * have been lost.
5335 */
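/*
 * Usage sketch from the user-space side (editor's addition, not part
 * of the original source), assuming the standard Mach interfaces
 * vm_allocate() and vm_purgable_control() declared via <mach/mach.h>;
 * the 16 MB size is an arbitrary example value:
 */
#if 0	/* example only */
	vm_address_t	addr = 0;
	vm_size_t	size = 16 * 1024 * 1024;
	int		state;
	kern_return_t	kr;

	/* create a purgeable region */
	kr = vm_allocate(mach_task_self(), &addr, size,
			 VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE);

	/* fill the region with recreatable data, then let it go volatile */
	state = VM_PURGABLE_VOLATILE;
	kr = vm_purgable_control(mach_task_self(), addr,
				 VM_PURGABLE_SET_STATE, &state);

	/* later: take it back; the returned old state says if it was emptied */
	state = VM_PURGABLE_NONVOLATILE;
	kr = vm_purgable_control(mach_task_self(), addr,
				 VM_PURGABLE_SET_STATE, &state);
	if (state == VM_PURGABLE_EMPTY) {
		/* contents were reclaimed; regenerate the data */
	}
#endif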
5336 /*
5337 * The object must be locked.
5338 */
5339 kern_return_t
5340 vm_object_purgable_control(
5341 vm_object_t object,
5342 vm_purgable_t control,
5343 int *state)
5344 {
5345 int old_state;
5346 int new_state;
5347
5348 if (object == VM_OBJECT_NULL) {
5349 /*
5350 * Object must already be present or it can't be purgeable.
5351 */
5352 return KERN_INVALID_ARGUMENT;
5353 }
5354
5355 /*
5356 * Get current state of the purgeable object.
5357 */
5358 old_state = object->purgable;
5359 if (old_state == VM_PURGABLE_DENY)
5360 return KERN_INVALID_ARGUMENT;
5361
5362 /* purgeable objects can't have delayed copies - now or in the future */
5363 assert(object->copy == VM_OBJECT_NULL);
5364 assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE);
5365
5366 /*
5367 * Execute the desired operation.
5368 */
5369 if (control == VM_PURGABLE_GET_STATE) {
5370 *state = old_state;
5371 return KERN_SUCCESS;
5372 }
5373
5374 new_state = *state & VM_PURGABLE_STATE_MASK;
5375 switch (new_state) {
5376 case VM_PURGABLE_DENY:
5377 case VM_PURGABLE_NONVOLATILE:
5378 object->purgable = new_state;
5379
5380 if (old_state != VM_PURGABLE_NONVOLATILE) {
5381 vm_page_lock_queues();
5382 assert(vm_page_purgeable_count >=
5383 object->resident_page_count);
5384 vm_page_purgeable_count -= object->resident_page_count;
5385
5386 if (old_state==VM_PURGABLE_VOLATILE) {
5387 assert(object->objq.next != NULL && object->objq.prev != NULL); /* object should be on a queue */
5388 purgeable_q_t queue = vm_purgeable_object_remove(object);
5389 assert(queue);
5390
5391 vm_purgeable_token_delete_first(queue);
5392 assert(queue->debug_count_objects>=0);
5393 };
5394 vm_page_unlock_queues();
5395 }
5396 break;
5397
5398 case VM_PURGABLE_VOLATILE:
5399
5400 if ((old_state != VM_PURGABLE_NONVOLATILE) && (old_state != VM_PURGABLE_VOLATILE))
5401 break;
5402 purgeable_q_t queue;
5403
5404 /* find the correct queue */
5405 if ((*state&VM_PURGABLE_ORDERING_MASK) == VM_PURGABLE_ORDERING_OBSOLETE)
5406 queue = &purgeable_queues[PURGEABLE_Q_TYPE_FIFO];
5407 else {
5408 if ((*state&VM_PURGABLE_BEHAVIOR_MASK) == VM_PURGABLE_BEHAVIOR_FIFO)
5409 queue = &purgeable_queues[PURGEABLE_Q_TYPE_FIFO];
5410 else
5411 queue = &purgeable_queues[PURGEABLE_Q_TYPE_LIFO];
5412 }
5413
5414 if (old_state == VM_PURGABLE_NONVOLATILE) {
5415 /* try to add token... this can fail */
5416 vm_page_lock_queues();
5417
5418 kern_return_t result = vm_purgeable_token_add(queue);
5419 if (result != KERN_SUCCESS) {
5420 vm_page_unlock_queues();
5421 return result;
5422 }
5423 vm_page_purgeable_count += object->resident_page_count;
5424
5425 vm_page_unlock_queues();
5426
5427 object->purgable = new_state;
5428
5429 /* object should not be on a queue */
5430 assert(object->objq.next == NULL && object->objq.prev == NULL);
5431 }
5432 else if (old_state == VM_PURGABLE_VOLATILE) {
5433 /*
5434 * If reassigning priorities / purgeable groups, we don't change the token
5435 * queue, so moving priorities will not make pages stay around longer. The
5436 * reasoning is that the algorithm gives the most priority to the most
5437 * important object; if a new token is added, the most important object's
5438 * priority is boosted. This already biases the system in favor of purgeable
5439 * queues that move a lot, so more biasing is not necessary when no new object is added.
5440 */
5441 assert(object->objq.next != NULL && object->objq.prev != NULL); /* object should be on a queue */
5442
5443 purgeable_q_t old_queue=vm_purgeable_object_remove(object);
5444 assert(old_queue);
5445
5446 if (old_queue != queue) {
5447 kern_return_t result;
5448
5449 /* Changing queue. Have to move token. */
5450 vm_page_lock_queues();
5451 vm_purgeable_token_delete_first(old_queue);
5452 result = vm_purgeable_token_add(queue);
5453 vm_page_unlock_queues();
5454
5455 assert(result==KERN_SUCCESS); /* this should never fail since we just freed a token */
5456 }
5457 };
5458 vm_purgeable_object_add(object, queue, (*state&VM_VOLATILE_GROUP_MASK)>>VM_VOLATILE_GROUP_SHIFT );
5459
5460 assert(queue->debug_count_objects>=0);
5461
5462 break;
5463
5464
5465 case VM_PURGABLE_EMPTY:
5466 if (old_state != new_state)
5467 {
5468 assert(old_state==VM_PURGABLE_NONVOLATILE || old_state==VM_PURGABLE_VOLATILE);
5469 if(old_state==VM_PURGABLE_VOLATILE) {
5470 assert(object->objq.next != NULL && object->objq.prev != NULL); /* object should be on a queue */
5471 purgeable_q_t old_queue=vm_purgeable_object_remove(object);
5472 assert(old_queue);
5473 vm_page_lock_queues();
5474 vm_purgeable_token_delete_first(old_queue);
5475 }
5476
5477 if (old_state==VM_PURGABLE_NONVOLATILE) {
5478 vm_page_purgeable_count += object->resident_page_count;
5479 vm_page_lock_queues();
5480 }
5481 (void) vm_object_purge(object);
5482 vm_page_unlock_queues();
5483 }
5484 break;
5485
5486 }
5487 *state = old_state;
5488
5489 return KERN_SUCCESS;
5490 }
5491
5492 #if TASK_SWAPPER
5493 /*
5494 * vm_object_res_deallocate
5495 *
5496 * (recursively) decrement residence counts on vm objects and their shadows.
5497 * Called from vm_object_deallocate and when swapping out an object.
5498 *
5499 * The object is locked, and remains locked throughout the function,
5500 * even as we iterate down the shadow chain. Locks on intermediate objects
5501 * will be dropped, but not the original object.
5502 *
5503 * NOTE: this function used to use recursion, rather than iteration.
5504 */
5505
5506 __private_extern__ void
5507 vm_object_res_deallocate(
5508 vm_object_t object)
5509 {
5510 vm_object_t orig_object = object;
5511 /*
5512 * Object is locked so it can be called directly
5513 * from vm_object_deallocate. Original object is never
5514 * unlocked.
5515 */
5516 assert(object->res_count > 0);
5517 while (--object->res_count == 0) {
5518 assert(object->ref_count >= object->res_count);
5519 vm_object_deactivate_all_pages(object);
5520 /* iterate on shadow, if present */
5521 if (object->shadow != VM_OBJECT_NULL) {
5522 vm_object_t tmp_object = object->shadow;
5523 vm_object_lock(tmp_object);
5524 if (object != orig_object)
5525 vm_object_unlock(object);
5526 object = tmp_object;
5527 assert(object->res_count > 0);
5528 } else
5529 break;
5530 }
5531 if (object != orig_object)
5532 vm_object_unlock(object);
5533 }
5534
5535 /*
5536 * vm_object_res_reference
5537 *
5538 * Internal function to increment residence count on a vm object
5539 * and its shadows. It is called only from vm_object_reference, and
5540 * when swapping in a vm object, via vm_map_swap.
5541 *
5542 * The object is locked, and remains locked throughout the function,
5543 * even as we iterate down the shadow chain. Locks on intermediate objects
5544 * will be dropped, but not the original object.
5545 *
5546 * NOTE: this function used to use recursion, rather than iteration.
5547 */
5548
5549 __private_extern__ void
5550 vm_object_res_reference(
5551 vm_object_t object)
5552 {
5553 vm_object_t orig_object = object;
5554 /*
5555 * Object is locked, so this can be called directly
5556 * from vm_object_reference. This lock is never released.
5557 */
5558 while ((++object->res_count == 1) &&
5559 (object->shadow != VM_OBJECT_NULL)) {
5560 vm_object_t tmp_object = object->shadow;
5561
5562 assert(object->ref_count >= object->res_count);
5563 vm_object_lock(tmp_object);
5564 if (object != orig_object)
5565 vm_object_unlock(object);
5566 object = tmp_object;
5567 }
5568 if (object != orig_object)
5569 vm_object_unlock(object);
5570 assert(orig_object->ref_count >= orig_object->res_count);
5571 }
5572 #endif /* TASK_SWAPPER */
5573
5574 /*
5575 * vm_object_reference:
5576 *
5577 * Gets another reference to the given object.
5578 */
5579 #ifdef vm_object_reference
5580 #undef vm_object_reference
5581 #endif
5582 __private_extern__ void
5583 vm_object_reference(
5584 register vm_object_t object)
5585 {
5586 if (object == VM_OBJECT_NULL)
5587 return;
5588
5589 vm_object_lock(object);
5590 assert(object->ref_count > 0);
5591 vm_object_reference_locked(object);
5592 vm_object_unlock(object);
5593 }
5594
5595 #ifdef MACH_BSD
5596 /*
5597 * Scale the vm_object_cache
5598 * This is required to make sure that the vm_object_cache is big
5599 * enough to effectively cache mapped files.
5600 * This is really important with UBC as all the regular file vnodes
5601 * have a memory object associated with them. Having this cache too
5602 * small results in rapid reclaim of vnodes and hurts performance a LOT!
5603 *
5604 * This is also needed as the number of vnodes can be dynamically scaled.
5605 */
5606 kern_return_t
5607 adjust_vm_object_cache(
5608 __unused vm_size_t oval,
5609 vm_size_t nval)
5610 {
5611 vm_object_cached_max = nval;
5612 vm_object_cache_trim(FALSE);
5613 return (KERN_SUCCESS);
5614 }
5615 #endif /* MACH_BSD */
5616
5617
5618 /*
5619 * vm_object_transpose
5620 *
5621 * This routine takes two VM objects of the same size and exchanges
5622 * their backing store.
5623 * The objects should be "quiesced" via a UPL operation with UPL_SET_IO_WIRE
5624 * and UPL_BLOCK_ACCESS if they are referenced anywhere.
5625 *
5626 * The VM objects must not be locked by caller.
5627 */
5628 kern_return_t
5629 vm_object_transpose(
5630 vm_object_t object1,
5631 vm_object_t object2,
5632 vm_object_size_t transpose_size)
5633 {
5634 vm_object_t tmp_object;
5635 kern_return_t retval;
5636 boolean_t object1_locked, object2_locked;
5637 boolean_t object1_paging, object2_paging;
5638 vm_page_t page;
5639 vm_object_offset_t page_offset;
5640
5641 tmp_object = VM_OBJECT_NULL;
5642 object1_locked = FALSE; object2_locked = FALSE;
5643 object1_paging = FALSE; object2_paging = FALSE;
5644
5645 if (object1 == object2 ||
5646 object1 == VM_OBJECT_NULL ||
5647 object2 == VM_OBJECT_NULL) {
5648 /*
5649 * If the 2 VM objects are the same, there's
5650 * no point in exchanging their backing store.
5651 */
5652 retval = KERN_INVALID_VALUE;
5653 goto done;
5654 }
5655
5656 vm_object_lock(object1);
5657 object1_locked = TRUE;
5658 if (!object1->alive || object1->terminating ||
5659 object1->copy || object1->shadow || object1->shadowed ||
5660 object1->purgable != VM_PURGABLE_DENY) {
5661 /*
5662 * We don't deal with copy or shadow objects (yet).
5663 */
5664 retval = KERN_INVALID_VALUE;
5665 goto done;
5666 }
5667 /*
5668 * Since we're about to mess with the object's backing store,
5669 * mark it as "paging_in_progress". Note that this is not enough
5670 * to prevent any paging activity on this object, so the caller should
5671 * have "quiesced" the objects beforehand, via a UPL operation with
5672 * UPL_SET_IO_WIRE (to make sure all the pages are there and wired)
5673 * and UPL_BLOCK_ACCESS (to mark the pages "busy").
5674 */
5675 vm_object_paging_begin(object1);
5676 object1_paging = TRUE;
5677 vm_object_unlock(object1);
5678 object1_locked = FALSE;
5679
5680 /*
5681 * Same as above for the 2nd object...
5682 */
5683 vm_object_lock(object2);
5684 object2_locked = TRUE;
5685 if (! object2->alive || object2->terminating ||
5686 object2->copy || object2->shadow || object2->shadowed ||
5687 object2->purgable != VM_PURGABLE_DENY) {
5688 retval = KERN_INVALID_VALUE;
5689 goto done;
5690 }
5691 vm_object_paging_begin(object2);
5692 object2_paging = TRUE;
5693 vm_object_unlock(object2);
5694 object2_locked = FALSE;
5695
5696 /*
5697 * Allocate a temporary VM object to hold object1's contents
5698 * while we copy object2 to object1.
5699 */
5700 tmp_object = vm_object_allocate(transpose_size);
5701 vm_object_lock(tmp_object);
5702 vm_object_paging_begin(tmp_object);
5703 tmp_object->can_persist = FALSE;
5704
5705 /*
5706 * Since we need to lock both objects at the same time,
5707 * make sure we always lock them in the same order to
5708 * avoid deadlocks.
5709 */
5710 if (object1 < object2) {
5711 vm_object_lock(object1);
5712 vm_object_lock(object2);
5713 } else {
5714 vm_object_lock(object2);
5715 vm_object_lock(object1);
5716 }
5717 object1_locked = TRUE;
5718 object2_locked = TRUE;
5719
5720 if (object1->size != object2->size ||
5721 object1->size != transpose_size) {
5722 /*
5723 * If the 2 objects don't have the same size, we can't
5724 * exchange their backing stores or one would overflow.
5725 * If their size doesn't match the caller's
5726 * "transpose_size", we can't do it either because the
5727 * transpose operation will affect the entire span of
5728 * the objects.
5729 */
5730 retval = KERN_INVALID_VALUE;
5731 goto done;
5732 }
5733
5734
5735 /*
5736 * Transpose the lists of resident pages.
5737 * This also updates the resident_page_count and the memq_hint.
5738 */
5739 if (object1->phys_contiguous || queue_empty(&object1->memq)) {
5740 /*
5741 * Object1 has no pages to preserve (it's either empty or
5742 * physically contiguous), so just transfer object2's pages
5743 * to object1. No need to go through an intermediate object.
5744 */
5745 while (!queue_empty(&object2->memq)) {
5746 page = (vm_page_t) queue_first(&object2->memq);
5747 vm_page_rename(page, object1, page->offset, FALSE);
5748 }
5749 assert(queue_empty(&object2->memq));
5750 } else if (object2->phys_contiguous || queue_empty(&object2->memq)) {
5751 /*
5752 * Object2 has no pages to preserve (it's either empty or
5753 * physically contiguous), so just transfer object1's pages
5754 * to object2. No need to go through an intermediate object.
5755 */
5756 while (!queue_empty(&object1->memq)) {
5757 page = (vm_page_t) queue_first(&object1->memq);
5758 vm_page_rename(page, object2, page->offset, FALSE);
5759 }
5760 assert(queue_empty(&object1->memq));
5761 } else {
5762 /* transfer object1's pages to tmp_object */
5763 vm_page_lock_queues();
5764 while (!queue_empty(&object1->memq)) {
5765 page = (vm_page_t) queue_first(&object1->memq);
5766 page_offset = page->offset;
5767 vm_page_remove(page);
5768 page->offset = page_offset;
5769 queue_enter(&tmp_object->memq, page, vm_page_t, listq);
5770 }
5771 vm_page_unlock_queues();
5772 assert(queue_empty(&object1->memq));
5773 /* transfer object2's pages to object1 */
5774 while (!queue_empty(&object2->memq)) {
5775 page = (vm_page_t) queue_first(&object2->memq);
5776 vm_page_rename(page, object1, page->offset, FALSE);
5777 }
5778 assert(queue_empty(&object2->memq));
5779 /* transfer tmp_object's pages to object1 */
5780 while (!queue_empty(&tmp_object->memq)) {
5781 page = (vm_page_t) queue_first(&tmp_object->memq);
5782 queue_remove(&tmp_object->memq, page,
5783 vm_page_t, listq);
5784 vm_page_insert(page, object2, page->offset);
5785 }
5786 assert(queue_empty(&tmp_object->memq));
5787 }
5788
5789 #define __TRANSPOSE_FIELD(field) \
5790 MACRO_BEGIN \
5791 tmp_object->field = object1->field; \
5792 object1->field = object2->field; \
5793 object2->field = tmp_object->field; \
5794 MACRO_END
5795
5796 /* "size" should be identical */
5797 assert(object1->size == object2->size);
5798 /* "Lock" refers to the object not its contents */
5799 /* "ref_count" refers to the object not its contents */
5800 #if TASK_SWAPPER
5801 /* "res_count" refers to the object not its contents */
5802 #endif
5803 /* "resident_page_count" was updated above when transposing pages */
5804 /* there should be no "copy" */
5805 assert(!object1->copy);
5806 assert(!object2->copy);
5807 /* there should be no "shadow" */
5808 assert(!object1->shadow);
5809 assert(!object2->shadow);
5810 __TRANSPOSE_FIELD(shadow_offset); /* used by phys_contiguous objects */
5811 __TRANSPOSE_FIELD(pager);
5812 __TRANSPOSE_FIELD(paging_offset);
5813 __TRANSPOSE_FIELD(pager_control);
5814 /* update the memory_objects' pointers back to the VM objects */
5815 if (object1->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
5816 memory_object_control_collapse(object1->pager_control,
5817 object1);
5818 }
5819 if (object2->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
5820 memory_object_control_collapse(object2->pager_control,
5821 object2);
5822 }
5823 __TRANSPOSE_FIELD(copy_strategy);
5824 /* "paging_in_progress" refers to the object not its contents */
5825 assert(object1->paging_in_progress);
5826 assert(object2->paging_in_progress);
5827 /* "all_wanted" refers to the object not its contents */
5828 __TRANSPOSE_FIELD(pager_created);
5829 __TRANSPOSE_FIELD(pager_initialized);
5830 __TRANSPOSE_FIELD(pager_ready);
5831 __TRANSPOSE_FIELD(pager_trusted);
5832 __TRANSPOSE_FIELD(can_persist);
5833 __TRANSPOSE_FIELD(internal);
5834 __TRANSPOSE_FIELD(temporary);
5835 __TRANSPOSE_FIELD(private);
5836 __TRANSPOSE_FIELD(pageout);
5837 /* "alive" should be set */
5838 assert(object1->alive);
5839 assert(object2->alive);
5840 /* "purgeable" should be non-purgeable */
5841 assert(object1->purgable == VM_PURGABLE_DENY);
5842 assert(object2->purgable == VM_PURGABLE_DENY);
5843 /* "shadowed" refers to the the object not its contents */
5844 __TRANSPOSE_FIELD(silent_overwrite);
5845 __TRANSPOSE_FIELD(advisory_pageout);
5846 __TRANSPOSE_FIELD(true_share);
5847 /* "terminating" should not be set */
5848 assert(!object1->terminating);
5849 assert(!object2->terminating);
5850 __TRANSPOSE_FIELD(named);
5851 /* "shadow_severed" refers to the object not its contents */
5852 __TRANSPOSE_FIELD(phys_contiguous);
5853 __TRANSPOSE_FIELD(nophyscache);
5854 /* "cached_list" should be NULL */
5855 assert(object1->cached_list.prev == NULL);
5856 assert(object1->cached_list.next == NULL);
5857 assert(object2->cached_list.prev == NULL);
5858 assert(object2->cached_list.next == NULL);
5859 /* "msr_q" is linked to the object not its contents */
5860 assert(queue_empty(&object1->msr_q));
5861 assert(queue_empty(&object2->msr_q));
5862 __TRANSPOSE_FIELD(last_alloc);
5863 __TRANSPOSE_FIELD(sequential);
5864 __TRANSPOSE_FIELD(pages_created);
5865 __TRANSPOSE_FIELD(pages_used);
5866 #if MACH_PAGEMAP
5867 __TRANSPOSE_FIELD(existence_map);
5868 #endif
5869 __TRANSPOSE_FIELD(cow_hint);
5870 #if MACH_ASSERT
5871 __TRANSPOSE_FIELD(paging_object);
5872 #endif
5873 __TRANSPOSE_FIELD(wimg_bits);
5874 __TRANSPOSE_FIELD(code_signed);
5875 __TRANSPOSE_FIELD(not_in_use);
5876 #ifdef UPL_DEBUG
5877 /* "uplq" refers to the object not its contents (see upl_transpose()) */
5878 #endif
5879
5880 #undef __TRANSPOSE_FIELD
5881
5882 retval = KERN_SUCCESS;
5883
5884 done:
5885 /*
5886 * Cleanup.
5887 */
5888 if (tmp_object != VM_OBJECT_NULL) {
5889 vm_object_paging_end(tmp_object);
5890 vm_object_unlock(tmp_object);
5891 /*
5892 * Re-initialize the temporary object to avoid
5893 * deallocating a real pager.
5894 */
5895 _vm_object_allocate(transpose_size, tmp_object);
5896 vm_object_deallocate(tmp_object);
5897 tmp_object = VM_OBJECT_NULL;
5898 }
5899
5900 if (object1_locked) {
5901 vm_object_unlock(object1);
5902 object1_locked = FALSE;
5903 }
5904 if (object2_locked) {
5905 vm_object_unlock(object2);
5906 object2_locked = FALSE;
5907 }
5908 if (object1_paging) {
5909 vm_object_lock(object1);
5910 vm_object_paging_end(object1);
5911 vm_object_unlock(object1);
5912 object1_paging = FALSE;
5913 }
5914 if (object2_paging) {
5915 vm_object_lock(object2);
5916 vm_object_paging_end(object2);
5917 vm_object_unlock(object2);
5918 object2_paging = FALSE;
5919 }
5920
5921 return retval;
5922 }
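
/*
 * Illustrative sketch (not compiled): the deadlock-avoidance idiom used
 * above when vm_object_transpose() needs both objects locked at once is
 * to always take the locks in ascending address order.  The helper below
 * is hypothetical -- it is not part of this file's API -- and simply
 * restates that ordering rule in isolation.
 */
#if 0
static void
vm_object_lock_pair_example(vm_object_t a, vm_object_t b)
{
	/* lock the lower-addressed object first, then the other one */
	if (a < b) {
		vm_object_lock(a);
		vm_object_lock(b);
	} else {
		vm_object_lock(b);
		vm_object_lock(a);
	}
}
#endif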
5923
5924
5925 /*
5926 * vm_object_cluster_size
5927 *
5928 * Determine how big a cluster we should issue an I/O for...
5929 *
5930 * Inputs: *start == offset of page needed
5931 * *length == maximum cluster pager can handle
5932 * Outputs: *start == beginning offset of cluster
5933 * *length == length of cluster to try
5934 *
5935 * The original *start will be encompassed by the cluster
5936 *
5937 */
5938 extern int speculative_reads_disabled;
5939
5940 uint32_t pre_heat_scaling[MAX_UPL_TRANSFER + 1];	/* histogram of requested pre-heat sizes, in pages */
5941 uint32_t pre_heat_cluster[MAX_UPL_TRANSFER + 1];	/* histogram of resulting cluster sizes, in pages; both indices can reach MAX_UPL_TRANSFER */
5942
5943 #define PRE_HEAT_MULTIPLIER 4
5944
5945 __private_extern__ void
5946 vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start,
5947 vm_size_t *length, vm_object_fault_info_t fault_info)
5948 {
5949 vm_size_t pre_heat_size;
5950 vm_size_t tail_size;
5951 vm_size_t head_size;
5952 vm_size_t max_length;
5953 vm_size_t cluster_size;
5954 vm_object_offset_t object_size;
5955 vm_object_offset_t orig_start;
5956 vm_object_offset_t target_start;
5957 vm_object_offset_t offset;
5958 vm_behavior_t behavior;
5959 boolean_t look_behind = TRUE;
5960 boolean_t look_ahead = TRUE;
5961 int sequential_run;
5962 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
5963
5964 assert( !(*length & PAGE_MASK));
5965 assert( !(*start & PAGE_MASK_64));
5966
5967 if ( (max_length = *length) > (MAX_UPL_TRANSFER * PAGE_SIZE) )
5968 max_length = (MAX_UPL_TRANSFER * PAGE_SIZE);
5969 /*
5970 * we'll always return a cluster size of at least
5971 * 1 page, since the original fault must always
5972 * be processed
5973 */
5974 *length = PAGE_SIZE;
5975
5976 if (speculative_reads_disabled || fault_info == NULL || max_length == 0) {
5977 /*
5978 * no cluster... just fault the page in
5979 */
5980 return;
5981 }
5982 orig_start = *start;
5983 target_start = orig_start;
5984 cluster_size = round_page_32(fault_info->cluster_size);
5985 behavior = fault_info->behavior;
5986
5987 vm_object_lock(object);
5988
5989 if (object->internal)
5990 object_size = object->size;
5991 else if (object->pager != MEMORY_OBJECT_NULL)
5992 vnode_pager_get_object_size(object->pager, &object_size);
5993 else
5994 goto out; /* pager is gone for this object, nothing more to do */
5995
5996 object_size = round_page_64(object_size);
5997
5998 if (orig_start >= object_size) {
5999 /*
6000 * fault occurred beyond the EOF...
6001 * we need to punt w/o changing the
6002 * starting offset
6003 */
6004 goto out;
6005 }
6006 if (object->pages_used > object->pages_created) {
6007 /*
6008 * must have wrapped our 32 bit counters
6009 * so reset
6010 */
6011 object->pages_used = object->pages_created = 0;
6012 }
6013 if ((sequential_run = object->sequential)) {
6014 if (sequential_run < 0) {
6015 sequential_behavior = VM_BEHAVIOR_RSEQNTL;
6016 sequential_run = 0 - sequential_run;
6017 } else {
6018 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
6019 }
6020 }
6021 switch(behavior) {
6022
6023 default:
6024 behavior = VM_BEHAVIOR_DEFAULT;
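/* treat any unrecognized behavior as VM_BEHAVIOR_DEFAULT and fall through */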
6025
6026 case VM_BEHAVIOR_DEFAULT:
6027 if (object->internal && fault_info->user_tag == VM_MEMORY_STACK)
6028 goto out;
6029
6030 if (sequential_run >= (3 * PAGE_SIZE)) {
6031 pre_heat_size = sequential_run + PAGE_SIZE;
6032
6033 if ((behavior = sequential_behavior) == VM_BEHAVIOR_SEQUENTIAL)
6034 look_behind = FALSE;
6035 else
6036 look_ahead = FALSE;
6037 } else {
6038 uint32_t pages_unused;
6039
6040 if (object->pages_created < 32 * PRE_HEAT_MULTIPLIER) {
6041 /*
6042 * prime the pump
6043 */
6044 pre_heat_size = PAGE_SIZE * 8 * PRE_HEAT_MULTIPLIER;
6045 break;
6046 }
6047 pages_unused = object->pages_created - object->pages_used;
6048
6049 if (pages_unused < (object->pages_created / 8)) {
6050 pre_heat_size = PAGE_SIZE * 32 * PRE_HEAT_MULTIPLIER;
6051 } else if (pages_unused < (object->pages_created / 4)) {
6052 pre_heat_size = PAGE_SIZE * 16 * PRE_HEAT_MULTIPLIER;
6053 } else if (pages_unused < (object->pages_created / 2)) {
6054 pre_heat_size = PAGE_SIZE * 8 * PRE_HEAT_MULTIPLIER;
6055 } else {
6056 pre_heat_size = PAGE_SIZE * 4 * PRE_HEAT_MULTIPLIER;
6057 }
6058 }
6059 break;
6060
6061 case VM_BEHAVIOR_RANDOM:
6062 if ((pre_heat_size = cluster_size) <= PAGE_SIZE)
6063 goto out;
6064 break;
6065
6066 case VM_BEHAVIOR_SEQUENTIAL:
6067 if ((pre_heat_size = cluster_size) == 0)
6068 pre_heat_size = sequential_run + PAGE_SIZE;
6069 look_behind = FALSE;
6070
6071 break;
6072
6073 case VM_BEHAVIOR_RSEQNTL:
6074 if ((pre_heat_size = cluster_size) == 0)
6075 pre_heat_size = sequential_run + PAGE_SIZE;
6076 look_ahead = FALSE;
6077
6078 break;
6079
6080 }
6081 if (pre_heat_size > max_length)
6082 pre_heat_size = max_length;
6083
6084 if (behavior == VM_BEHAVIOR_DEFAULT && vm_page_free_count < vm_page_free_target)
6085 pre_heat_size /= 2;
6086
6087 if (look_ahead == TRUE) {
6088 if (look_behind == TRUE)
6089 target_start &= ~(pre_heat_size - 1);
6090
6091 if ((target_start + pre_heat_size) > object_size)
6092 pre_heat_size = (vm_size_t)(trunc_page_64(object_size - target_start));
6093
6094 tail_size = pre_heat_size - (orig_start - target_start) - PAGE_SIZE;
6095 } else {
6096 if (pre_heat_size > target_start)
6097 pre_heat_size = target_start;
6098 tail_size = 0;
6099 }
6100 pre_heat_scaling[pre_heat_size / PAGE_SIZE]++;
6101
6102 if (pre_heat_size <= PAGE_SIZE)
6103 goto out;
6104
6105 if (look_behind == TRUE) {
6106 /*
6107 * take a look at the pages before the original
6108 * faulting offset
6109 */
6110 head_size = pre_heat_size - tail_size - PAGE_SIZE;
6111
6112 for (offset = orig_start - PAGE_SIZE_64; head_size; offset -= PAGE_SIZE_64, head_size -= PAGE_SIZE) {
6113 /*
6114 * don't poke below the lowest offset
6115 */
6116 if (offset < fault_info->lo_offset)
6117 break;
6118 /*
6119 * for external objects and internal objects w/o an existence map
6120 * vm_external_state_get will return VM_EXTERNAL_STATE_UNKNOWN
6121 */
6122 #if MACH_PAGEMAP
6123 if (vm_external_state_get(object->existence_map, offset) == VM_EXTERNAL_STATE_ABSENT) {
6124 /*
6125 * we know for a fact that the pager can't provide the page
6126 * so don't include it or any pages beyond it in this cluster
6127 */
6128 break;
6129 }
6130 #endif
6131 if (vm_page_lookup(object, offset) != VM_PAGE_NULL) {
6132 /*
6133 * don't bridge resident pages
6134 */
6135 break;
6136 }
6137 *start = offset;
6138 *length += PAGE_SIZE;
6139 }
6140 }
6141 if (look_ahead == TRUE) {
6142 for (offset = orig_start + PAGE_SIZE_64; tail_size; offset += PAGE_SIZE_64, tail_size -= PAGE_SIZE) {
6143 /*
6144 * don't poke above the highest offset
6145 */
6146 if (offset >= fault_info->hi_offset)
6147 break;
6148 /*
6149 * for external objects and internal objects w/o an existence map
6150 * vm_external_state_get will return VM_EXTERNAL_STATE_UNKNOWN
6151 */
6152 #if MACH_PAGEMAP
6153 if (vm_external_state_get(object->existence_map, offset) == VM_EXTERNAL_STATE_ABSENT) {
6154 /*
6155 * we know for a fact that the pager can't provide the page
6156 * so don't include it or any pages beyond it in this cluster
6157 */
6158 break;
6159 }
6160 #endif
6161 if (vm_page_lookup(object, offset) != VM_PAGE_NULL) {
6162 /*
6163 * don't bridge resident pages
6164 */
6165 break;
6166 }
6167 *length += PAGE_SIZE;
6168 }
6169 }
6170 out:
6171 pre_heat_cluster[*length / PAGE_SIZE]++;
6172
6173 vm_object_unlock(object);
6174 }
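
/*
 * Illustrative sketch (not compiled): the VM_BEHAVIOR_DEFAULT branch of
 * vm_object_cluster_size() scales its speculative read-ahead by how well
 * past read-ahead paid off.  The hypothetical helper below restates just
 * that sizing heuristic: the fewer pages that were created but never
 * used, the larger the next pre-heat request.
 */
#if 0
static vm_size_t
pre_heat_size_example(uint32_t pages_created, uint32_t pages_used)
{
	uint32_t	pages_unused;

	if (pages_created < 32 * PRE_HEAT_MULTIPLIER) {
		/* not enough history yet: prime the pump */
		return (PAGE_SIZE * 8 * PRE_HEAT_MULTIPLIER);
	}
	pages_unused = pages_created - pages_used;

	if (pages_unused < (pages_created / 8))
		return (PAGE_SIZE * 32 * PRE_HEAT_MULTIPLIER);
	else if (pages_unused < (pages_created / 4))
		return (PAGE_SIZE * 16 * PRE_HEAT_MULTIPLIER);
	else if (pages_unused < (pages_created / 2))
		return (PAGE_SIZE * 8 * PRE_HEAT_MULTIPLIER);
	else
		return (PAGE_SIZE * 4 * PRE_HEAT_MULTIPLIER);
}
#endif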
6175
6176
6177 /*
6178 * Allow manipulation of individual page state. This is actually part of
6179 * the UPL regimen but takes place on the VM object rather than on a UPL
6180 */
6181
6182 kern_return_t
6183 vm_object_page_op(
6184 vm_object_t object,
6185 vm_object_offset_t offset,
6186 int ops,
6187 ppnum_t *phys_entry,
6188 int *flags)
6189 {
6190 vm_page_t dst_page;
6191
6192 vm_object_lock(object);
6193
6194 if(ops & UPL_POP_PHYSICAL) {
6195 if(object->phys_contiguous) {
6196 if (phys_entry) {
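/* for a phys_contiguous object, shadow_offset holds its base physical address; shifting by 12 converts it to a 4K page number */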
6197 *phys_entry = (ppnum_t)
6198 (object->shadow_offset >> 12);
6199 }
6200 vm_object_unlock(object);
6201 return KERN_SUCCESS;
6202 } else {
6203 vm_object_unlock(object);
6204 return KERN_INVALID_OBJECT;
6205 }
6206 }
6207 if(object->phys_contiguous) {
6208 vm_object_unlock(object);
6209 return KERN_INVALID_OBJECT;
6210 }
6211
6212 while(TRUE) {
6213 if((dst_page = vm_page_lookup(object,offset)) == VM_PAGE_NULL) {
6214 vm_object_unlock(object);
6215 return KERN_FAILURE;
6216 }
6217
6218 /* Sync up on getting the busy bit */
6219 if((dst_page->busy || dst_page->cleaning) &&
6220 (((ops & UPL_POP_SET) &&
6221 (ops & UPL_POP_BUSY)) || (ops & UPL_POP_DUMP))) {
6222 /* someone else is playing with the page, we will */
6223 /* have to wait */
6224 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6225 continue;
6226 }
6227
6228 if (ops & UPL_POP_DUMP) {
6229 if (dst_page->pmapped == TRUE)
6230 pmap_disconnect(dst_page->phys_page);
6231
6232 vm_page_lock_queues();
6233 vm_page_free(dst_page);
6234 vm_page_unlock_queues();
6235
6236 break;
6237 }
6238
6239 if (flags) {
6240 *flags = 0;
6241
6242 /* Get the condition of flags before requested ops */
6243 /* are undertaken */
6244
6245 if(dst_page->dirty) *flags |= UPL_POP_DIRTY;
6246 if(dst_page->pageout) *flags |= UPL_POP_PAGEOUT;
6247 if(dst_page->precious) *flags |= UPL_POP_PRECIOUS;
6248 if(dst_page->absent) *flags |= UPL_POP_ABSENT;
6249 if(dst_page->busy) *flags |= UPL_POP_BUSY;
6250 }
6251
6252 /* The caller should have made a call either contingent with */
6253 /* or prior to this call to set UPL_POP_BUSY */
6254 if(ops & UPL_POP_SET) {
6255 /* The protection granted with this assert will */
6256 /* not be complete. If the caller violates the */
6257 /* convention and attempts to change page state */
6258 /* without first setting busy we may not see it */
6259 /* because the page may already be busy. However */
6260 /* if such violations occur we will assert sooner */
6261 /* or later. */
6262 assert(dst_page->busy || (ops & UPL_POP_BUSY));
6263 if (ops & UPL_POP_DIRTY) dst_page->dirty = TRUE;
6264 if (ops & UPL_POP_PAGEOUT) dst_page->pageout = TRUE;
6265 if (ops & UPL_POP_PRECIOUS) dst_page->precious = TRUE;
6266 if (ops & UPL_POP_ABSENT) dst_page->absent = TRUE;
6267 if (ops & UPL_POP_BUSY) dst_page->busy = TRUE;
6268 }
6269
6270 if(ops & UPL_POP_CLR) {
6271 assert(dst_page->busy);
6272 if (ops & UPL_POP_DIRTY) dst_page->dirty = FALSE;
6273 if (ops & UPL_POP_PAGEOUT) dst_page->pageout = FALSE;
6274 if (ops & UPL_POP_PRECIOUS) dst_page->precious = FALSE;
6275 if (ops & UPL_POP_ABSENT) dst_page->absent = FALSE;
6276 if (ops & UPL_POP_BUSY) {
6277 dst_page->busy = FALSE;
6278 PAGE_WAKEUP(dst_page);
6279 }
6280 }
6281
6282 if (dst_page->encrypted) {
6283 /*
6284 * ENCRYPTED SWAP:
6285 * We need to decrypt this encrypted page before the
6286 * caller can access its contents.
6287 * But if the caller really wants to access the page's
6288 * contents, they have to keep the page "busy".
6289 * Otherwise, the page could get recycled or re-encrypted
6290 * at any time.
6291 */
6292 if ((ops & UPL_POP_SET) && (ops & UPL_POP_BUSY) &&
6293 dst_page->busy) {
6294 /*
6295 * The page is stable enough to be accessed by
6296 * the caller, so make sure its contents are
6297 * not encrypted.
6298 */
6299 vm_page_decrypt(dst_page, 0);
6300 } else {
6301 /*
6302 * The page is not busy, so don't bother
6303 * decrypting it, since anything could
6304 * happen to it between now and when the
6305 * caller wants to access it.
6306 * We should not give the caller access
6307 * to this page.
6308 */
6309 assert(!phys_entry);
6310 }
6311 }
6312
6313 if (phys_entry) {
6314 /*
6315 * The physical page number will remain valid
6316 * only if the page is kept busy.
6317 * ENCRYPTED SWAP: make sure we don't let the
6318 * caller access an encrypted page.
6319 */
6320 assert(dst_page->busy);
6321 assert(!dst_page->encrypted);
6322 *phys_entry = dst_page->phys_page;
6323 }
6324
6325 break;
6326 }
6327
6328 vm_object_unlock(object);
6329 return KERN_SUCCESS;
6330
6331 }
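
/*
 * Illustrative sketch (not compiled): a hypothetical caller of
 * vm_object_page_op() above, using UPL_POP_SET|UPL_POP_BUSY to mark a
 * resident page busy while reading its physical page number, then
 * UPL_POP_CLR|UPL_POP_BUSY to release it.  Per the comments above, the
 * physical page number is only stable while the page stays busy.
 */
#if 0
static kern_return_t
page_op_example(vm_object_t object, vm_object_offset_t offset, ppnum_t *ppnump)
{
	kern_return_t	kr;

	kr = vm_object_page_op(object, offset,
			       UPL_POP_SET | UPL_POP_BUSY,
			       ppnump, NULL);
	if (kr != KERN_SUCCESS)
		return kr;

	/* ... use *ppnump while the page is held busy ... */

	return vm_object_page_op(object, offset,
				 UPL_POP_CLR | UPL_POP_BUSY,
				 NULL, NULL);
}
#endif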
6332
6333 /*
6334 * vm_object_range_op offers performance enhancement over
6335 * vm_object_page_op for page_op functions which do not require page
6336 * level state to be returned from the call. Page_op was created to provide
6337 * a low-cost alternative to page manipulation via UPLs when only a single
6338 * page was involved. The range_op call establishes the ability in the _op
6339 * family of functions to work on multiple pages where the lack of page level
6340 * state handling allows the caller to avoid the overhead of the upl structures.
6341 */
6342
6343 kern_return_t
6344 vm_object_range_op(
6345 vm_object_t object,
6346 vm_object_offset_t offset_beg,
6347 vm_object_offset_t offset_end,
6348 int ops,
6349 int *range)
6350 {
6351 vm_object_offset_t offset;
6352 vm_page_t dst_page;
6353
6354 if (object->resident_page_count == 0) {
6355 if (range) {
6356 if (ops & UPL_ROP_PRESENT)
6357 *range = 0;
6358 else
6359 *range = offset_end - offset_beg;
6360 }
6361 return KERN_SUCCESS;
6362 }
6363 vm_object_lock(object);
6364
6365 if (object->phys_contiguous) {
6366 vm_object_unlock(object);
6367 return KERN_INVALID_OBJECT;
6368 }
6369
6370 offset = offset_beg & ~PAGE_MASK_64;
6371
6372 while (offset < offset_end) {
6373 dst_page = vm_page_lookup(object, offset);
6374 if (dst_page != VM_PAGE_NULL) {
6375 if (ops & UPL_ROP_DUMP) {
6376 if (dst_page->busy || dst_page->cleaning) {
6377 /*
6378 * someone else is playing with the
6379 * page, we will have to wait
6380 */
6381 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6382 /*
6383 * need to look the page up again since its
6384 * state may have changed while we slept;
6385 * it might even belong to a different object
6386 * at this point
6387 */
6388 continue;
6389 }
6390 if (dst_page->pmapped == TRUE)
6391 pmap_disconnect(dst_page->phys_page);
6392
6393 vm_page_lock_queues();
6394 vm_page_free(dst_page);
6395 vm_page_unlock_queues();
6396
6397 } else if (ops & UPL_ROP_ABSENT)
6398 break;
6399 } else if (ops & UPL_ROP_PRESENT)
6400 break;
6401
6402 offset += PAGE_SIZE;
6403 }
6404 vm_object_unlock(object);
6405
6406 if (range) {
6407 if (offset > offset_end)
6408 offset = offset_end;
6409 if (offset > offset_beg)
6410 *range = offset - offset_beg;
6411 else *range = 0;
6412 }
6413 return KERN_SUCCESS;
6414 }
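
/*
 * Illustrative sketch (not compiled): a hypothetical caller of
 * vm_object_range_op() above, using UPL_ROP_PRESENT to measure how many
 * bytes of the given range are already resident, starting at offset_beg
 * and stopping at the first non-resident page.
 */
#if 0
static vm_object_size_t
resident_run_example(vm_object_t object,
		     vm_object_offset_t offset_beg,
		     vm_object_offset_t offset_end)
{
	int	range = 0;

	if (vm_object_range_op(object, offset_beg, offset_end,
			       UPL_ROP_PRESENT, &range) != KERN_SUCCESS)
		return 0;
	return (vm_object_size_t) range;
}
#endif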
6415
6416
6417 uint32_t scan_object_collision = 0;
6418
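/*
 * The locking wrappers below add one twist to plain lck_rw operations:
 * if vm_pageout_scan() has advertised interest in this object (via
 * vm_pageout_scan_wants_object), the caller briefly pauses before
 * taking the lock so the pageout scan thread can win the race;
 * scan_object_collision counts how often that happens.
 */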
6419 void
6420 vm_object_lock(vm_object_t object)
6421 {
6422 if (object == vm_pageout_scan_wants_object) {
6423 scan_object_collision++;
6424 mutex_pause(2);
6425 }
6426 lck_rw_lock_exclusive(&object->Lock);
6427 }
6428
6429 boolean_t
6430 vm_object_lock_try(vm_object_t object)
6431 {
6432 if (object == vm_pageout_scan_wants_object) {
6433 scan_object_collision++;
6434 mutex_pause(2);
6435 }
6436 return (lck_rw_try_lock_exclusive(&object->Lock));
6437 }
6438
6439 void
6440 vm_object_lock_shared(vm_object_t object)
6441 {
6442 if (object == vm_pageout_scan_wants_object) {
6443 scan_object_collision++;
6444 mutex_pause(2);
6445 }
6446 lck_rw_lock_shared(&object->Lock);
6447 }
6448
6449 boolean_t
6450 vm_object_lock_try_shared(vm_object_t object)
6451 {
6452 if (object == vm_pageout_scan_wants_object) {
6453 scan_object_collision++;
6454 mutex_pause(2);
6455 }
6456 return (lck_rw_try_lock_shared(&object->Lock));
6457 }
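
/*
 * Illustrative sketch (not compiled): a hypothetical caller showing the
 * intended try-lock pattern for the wrappers above -- attempt the
 * exclusive lock without blocking and fall back to the blocking variant
 * when vm_object_lock_try() fails.
 */
#if 0
static void
lock_object_example(vm_object_t object)
{
	if (!vm_object_lock_try(object)) {
		/* contended (possibly with vm_pageout_scan): block for it */
		vm_object_lock(object);
	}
	/* ... operate on the object ... */
	vm_object_unlock(object);
}
#endif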