osfmk/vm/vm_object.c

   1 /*
   2  * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   vm/vm_object.c
  60  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  61  *
  62  *      Virtual memory object module.
  63  */
  64
  65 #include <debug.h>
  66 #include <mach_pagemap.h>
  67 #include <task_swapper.h>
  68
  69 #include <mach/mach_types.h>
  70 #include <mach/memory_object.h>
  71 #include <mach/memory_object_default.h>
  72 #include <mach/memory_object_control_server.h>
  73 #include <mach/vm_param.h>
  74
  75 #include <ipc/ipc_types.h>
  76 #include <ipc/ipc_port.h>
  77
  78 #include <kern/kern_types.h>
  79 #include <kern/assert.h>
  80 #include <kern/lock.h>
  81 #include <kern/queue.h>
  82 #include <kern/xpr.h>
  83 #include <kern/zalloc.h>
  84 #include <kern/host.h>
  85 #include <kern/host_statistics.h>
  86 #include <kern/processor.h>
  87 #include <kern/misc_protos.h>
  88
  89 #include <vm/memory_object.h>
  90 #include <vm/vm_fault.h>
  91 #include <vm/vm_map.h>
  92 #include <vm/vm_object.h>
  93 #include <vm/vm_page.h>
  94 #include <vm/vm_pageout.h>
  95 #include <vm/vm_protos.h>
  96 #include <vm/vm_purgeable_internal.h>
  97
  98 #if CONFIG_EMBEDDED
  99 #include <sys/kern_memorystatus.h>
 100 #endif
 101
 102 /*
 103  *      Virtual memory objects maintain the actual data
 104  *      associated with allocated virtual memory.  A given
 105  *      page of memory exists within exactly one object.
 106  *
 107  *      An object is only deallocated when all "references"
 108  *      are given up.
 109  *
 110  *      Associated with each object is a list of all resident
 111  *      memory pages belonging to that object; this list is
 112  *      maintained by the "vm_page" module, but locked by the object's
 113  *      lock.
 114  *
 115  *      Each object also records the memory object reference
 116  *      that is used by the kernel to request and write
 117  *      back data (the memory object, field "pager"), etc...
 118  *
 119  *      Virtual memory objects are allocated to provide
 120  *      zero-filled memory (vm_allocate) or map a user-defined
 121  *      memory object into a virtual address space (vm_map).
 122  *
 123  *      Virtual memory objects that refer to a user-defined
 124  *      memory object are called "permanent", because all changes
 125  *      made in virtual memory are reflected back to the
 126  *      memory manager, which may then store it permanently.
 127  *      Other virtual memory objects are called "temporary",
 128  *      meaning that changes need be written back only when
 129  *      necessary to reclaim pages, and that storage associated
 130  *      with the object can be discarded once it is no longer
 131  *      mapped.
 132  *
 133  *      A permanent memory object may be mapped into more
 134  *      than one virtual address space.  Moreover, two threads
 135  *      may attempt to make the first mapping of a memory
 136  *      object concurrently.  Only one thread is allowed to
  137  *      complete this mapping; all others wait until the
 138  *      "pager_initialized" field is asserted, indicating
 139  *      that the first thread has initialized all of the
 140  *      necessary fields in the virtual memory object structure.
 141  *
 142  *      The kernel relies on a *default memory manager* to
 143  *      provide backing storage for the zero-filled virtual
 144  *      memory objects.  The pager memory objects associated
 145  *      with these temporary virtual memory objects are only
 146  *      requested from the default memory manager when it
 147  *      becomes necessary.  Virtual memory objects
 148  *      that depend on the default memory manager are called
 149  *      "internal".  The "pager_created" field is provided to
 150  *      indicate whether these ports have ever been allocated.
 151  *
 152  *      The kernel may also create virtual memory objects to
 153  *      hold changed pages after a copy-on-write operation.
 154  *      In this case, the virtual memory object (and its
 155  *      backing storage -- its memory object) only contain
 156  *      those pages that have been changed.  The "shadow"
 157  *      field refers to the virtual memory object that contains
 158  *      the remainder of the contents.  The "shadow_offset"
 159  *      field indicates where in the "shadow" these contents begin.
 160  *      The "copy" field refers to a virtual memory object
 161  *      to which changed pages must be copied before changing
 162  *      this object, in order to implement another form
 163  *      of copy-on-write optimization.
 164  *
 165  *      The virtual memory object structure also records
 166  *      the attributes associated with its memory object.
 167  *      The "pager_ready", "can_persist" and "copy_strategy"
 168  *      fields represent those attributes.  The "cached_list"
 169  *      field is used in the implementation of the persistence
 170  *      attribute.
 171  *
 172  * ZZZ Continue this comment.
 173  */
 174
 175 /* Forward declarations for internal functions. */
 176 static kern_return_t    vm_object_terminate(
 177                                 vm_object_t     object);
 178
 179 extern void             vm_object_remove(
 180                                 vm_object_t     object);
 181
 182 static kern_return_t    vm_object_copy_call(
 183                                 vm_object_t             src_object,
 184                                 vm_object_offset_t      src_offset,
 185                                 vm_object_size_t        size,
 186                                 vm_object_t             *_result_object);
 187
 188 static void             vm_object_do_collapse(
 189                                 vm_object_t     object,
 190                                 vm_object_t     backing_object);
 191
 192 static void             vm_object_do_bypass(
 193                                 vm_object_t     object,
 194                                 vm_object_t     backing_object);
 195
 196 static void             vm_object_release_pager(
 197                                 memory_object_t pager,
 198                                 boolean_t       hashed);
 199
 200 static zone_t           vm_object_zone;         /* vm backing store zone */
 201
 202 /*
 203  *      All wired-down kernel memory belongs to a single virtual
 204  *      memory object (kernel_object) to avoid wasting data structures.
 205  */
 206 static struct vm_object                 kernel_object_store;
 207 vm_object_t                                             kernel_object;
 208
 209
 210 /*
 211  *      The submap object is used as a placeholder for vm_map_submap
 212  *      operations.  The object is declared in vm_map.c because it
 213  *      is exported by the vm_map module.  The storage is declared
 214  *      here because it must be initialized here.
 215  */
 216 static struct vm_object                 vm_submap_object_store;
 217
 218 /*
 219  *      Virtual memory objects are initialized from
 220  *      a template (see vm_object_allocate).
 221  *
 222  *      When adding a new field to the virtual memory
 223  *      object structure, be sure to add initialization
 224  *      (see _vm_object_allocate()).
 225  */
 226 static struct vm_object                 vm_object_template;
 227
 228 unsigned int vm_page_purged_wired = 0;
 229 unsigned int vm_page_purged_busy = 0;
 230 unsigned int vm_page_purged_others = 0;
 231
 232 #if VM_OBJECT_CACHE
 233 /*
 234  *      Virtual memory objects that are not referenced by
 235  *      any address maps, but that are allowed to persist
 236  *      (an attribute specified by the associated memory manager),
 237  *      are kept in a queue (vm_object_cached_list).
 238  *
 239  *      When an object from this queue is referenced again,
 240  *      for example to make another address space mapping,
 241  *      it must be removed from the queue.  That is, the
 242  *      queue contains *only* objects with zero references.
 243  *
 244  *      The kernel may choose to terminate objects from this
 245  *      queue in order to reclaim storage.  The current policy
 246  *      is to permit a fixed maximum number of unreferenced
 247  *      objects (vm_object_cached_max).
 248  *
 249  *      A spin lock (accessed by routines
 250  *      vm_object_cache_{lock,lock_try,unlock}) governs the
 251  *      object cache.  It must be held when objects are
 252  *      added to or removed from the cache (in vm_object_terminate).
 253  *      The routines that acquire a reference to a virtual
 254  *      memory object based on one of the memory object ports
 255  *      must also lock the cache.
 256  *
 257  *      Ideally, the object cache should be more isolated
 258  *      from the reference mechanism, so that the lock need
 259  *      not be held to make simple references.
 260  */
 261 static vm_object_t      vm_object_cache_trim(
 262                                 boolean_t called_from_vm_object_deallocate);
 263
 264 static queue_head_t     vm_object_cached_list;
 265 static int              vm_object_cached_count=0;
 266 static int              vm_object_cached_high;  /* highest # cached objects */
 267 static int              vm_object_cached_max = 512;     /* may be patched*/
 268
 269 static lck_mtx_t        vm_object_cached_lock_data;
 270 static lck_mtx_ext_t    vm_object_cached_lock_data_ext;
 271
 272 #define vm_object_cache_lock()          \
 273                 lck_mtx_lock(&vm_object_cached_lock_data)
 274 #define vm_object_cache_lock_try()              \
 275                 lck_mtx_try_lock(&vm_object_cached_lock_data)
 276 #define vm_object_cache_lock_spin()             \
 277                 lck_mtx_lock_spin(&vm_object_cached_lock_data)
 278 #define vm_object_cache_unlock()        \
 279                 lck_mtx_unlock(&vm_object_cached_lock_data)
 280
 281 #endif  /* VM_OBJECT_CACHE */
 282
 283
 284 static void             vm_object_deactivate_all_pages(
 285                                 vm_object_t     object);
 286
 287
 288 #define VM_OBJECT_HASH_COUNT            1024
 289 #define VM_OBJECT_HASH_LOCK_COUNT       512
 290
 291 static lck_mtx_t        vm_object_hashed_lock_data[VM_OBJECT_HASH_LOCK_COUNT];
 292 static lck_mtx_ext_t    vm_object_hashed_lock_data_ext[VM_OBJECT_HASH_LOCK_COUNT];
 293
 294 static queue_head_t     vm_object_hashtable[VM_OBJECT_HASH_COUNT];
 295 static struct zone      *vm_object_hash_zone;
 296
 297 struct vm_object_hash_entry {
 298         queue_chain_t           hash_link;      /* hash chain link */
 299         memory_object_t pager;          /* pager we represent */
 300         vm_object_t             object;         /* corresponding object */
 301         boolean_t               waiting;        /* someone waiting for
 302                                                  * termination */
 303 };
 304
 305 typedef struct vm_object_hash_entry     *vm_object_hash_entry_t;
 306 #define VM_OBJECT_HASH_ENTRY_NULL       ((vm_object_hash_entry_t) 0)
 307
 308 #define VM_OBJECT_HASH_SHIFT    5
 309 #define vm_object_hash(pager) \
 310         ((int)((((uintptr_t)pager) >> VM_OBJECT_HASH_SHIFT) % VM_OBJECT_HASH_COUNT))
 311
 312 #define vm_object_lock_hash(pager) \
 313         ((int)((((uintptr_t)pager) >> VM_OBJECT_HASH_SHIFT) % VM_OBJECT_HASH_LOCK_COUNT))
 314
 315 void vm_object_hash_entry_free(
 316         vm_object_hash_entry_t  entry);
 317
 318 static void vm_object_reap(vm_object_t object);
 319 static void vm_object_reap_async(vm_object_t object);
 320 static void vm_object_reaper_thread(void);
 321
 322 static lck_mtx_t        vm_object_reaper_lock_data;
 323 static lck_mtx_ext_t    vm_object_reaper_lock_data_ext;
 324
 325 static queue_head_t vm_object_reaper_queue; /* protected by vm_object_reaper_lock() */
 326 unsigned int vm_object_reap_count = 0;
 327 unsigned int vm_object_reap_count_async = 0;
 328
 329 #define vm_object_reaper_lock()         \
 330                 lck_mtx_lock(&vm_object_reaper_lock_data)
 331 #define vm_object_reaper_lock_spin()            \
 332                 lck_mtx_lock_spin(&vm_object_reaper_lock_data)
 333 #define vm_object_reaper_unlock()       \
 334                 lck_mtx_unlock(&vm_object_reaper_lock_data)
 335
 336
 337
 338 static lck_mtx_t *
 339 vm_object_hash_lock_spin(
 340         memory_object_t pager)
 341 {
 342         int     index;
 343
 344         index = vm_object_lock_hash(pager);
 345
 346         lck_mtx_lock_spin(&vm_object_hashed_lock_data[index]);
 347
 348         return (&vm_object_hashed_lock_data[index]);
 349 }
 350
 351 static void
 352 vm_object_hash_unlock(lck_mtx_t *lck)
 353 {
 354         lck_mtx_unlock(lck);
 355 }
 356
 357
 358 /*
 359  *      vm_object_hash_lookup looks up a pager in the hashtable
 360  *      and returns the corresponding entry, with optional removal.
 361  */
 362 static vm_object_hash_entry_t
 363 vm_object_hash_lookup(
 364         memory_object_t pager,
 365         boolean_t       remove_entry)
 366 {
 367         queue_t                 bucket;
 368         vm_object_hash_entry_t  entry;
 369
 370         bucket = &vm_object_hashtable[vm_object_hash(pager)];
 371
 372         entry = (vm_object_hash_entry_t)queue_first(bucket);
 373         while (!queue_end(bucket, (queue_entry_t)entry)) {
 374                 if (entry->pager == pager) {
 375                         if (remove_entry) {
 376                                 queue_remove(bucket, entry,
 377                                              vm_object_hash_entry_t, hash_link);
 378                         }
 379                         return(entry);
 380                 }
 381                 entry = (vm_object_hash_entry_t)queue_next(&entry->hash_link);
 382         }
 383         return(VM_OBJECT_HASH_ENTRY_NULL);
 384 }
 385
 386 /*
 387  *      vm_object_hash_enter enters the specified
 388  *      pager / cache object association in the hashtable.
 389  */
 390
 391 static void
 392 vm_object_hash_insert(
 393         vm_object_hash_entry_t  entry,
 394         vm_object_t             object)
 395 {
 396         queue_t         bucket;
 397
 398         bucket = &vm_object_hashtable[vm_object_hash(entry->pager)];
 399
 400         queue_enter(bucket, entry, vm_object_hash_entry_t, hash_link);
 401
 402         entry->object = object;
 403         object->hashed = TRUE;
 404 }
 405
 406 static vm_object_hash_entry_t
 407 vm_object_hash_entry_alloc(
 408         memory_object_t pager)
 409 {
 410         vm_object_hash_entry_t  entry;
 411
 412         entry = (vm_object_hash_entry_t)zalloc(vm_object_hash_zone);
 413         entry->pager = pager;
 414         entry->object = VM_OBJECT_NULL;
 415         entry->waiting = FALSE;
 416
 417         return(entry);
 418 }
 419
 420 void
 421 vm_object_hash_entry_free(
 422         vm_object_hash_entry_t  entry)
 423 {
 424         zfree(vm_object_hash_zone, entry);
 425 }
 426
 427 /*
 428  *      vm_object_allocate:
 429  *
 430  *      Returns a new object with the given size.
 431  */
 432
 433 __private_extern__ void
 434 _vm_object_allocate(
 435         vm_object_size_t        size,
 436         vm_object_t             object)
 437 {
 438         XPR(XPR_VM_OBJECT,
 439                 "vm_object_allocate, object 0x%X size 0x%X\n",
 440                 object, size, 0,0,0);
 441
 442         *object = vm_object_template;
 443         queue_init(&object->memq);
 444         queue_init(&object->msr_q);
 445 #if UPL_DEBUG
 446         queue_init(&object->uplq);
 447 #endif /* UPL_DEBUG */
 448         vm_object_lock_init(object);
 449         object->size = size;
 450 }
 451
 452 __private_extern__ vm_object_t
 453 vm_object_allocate(
 454         vm_object_size_t        size)
 455 {
 456         register vm_object_t object;
 457
 458         object = (vm_object_t) zalloc(vm_object_zone);
 459
 460 //      dbgLog(object, size, 0, 2);                     /* (TEST/DEBUG) */
 461
 462         if (object != VM_OBJECT_NULL)
 463                 _vm_object_allocate(size, object);
 464
 465         return object;
 466 }
 467
 468
 469 lck_grp_t               vm_object_lck_grp;
 470 lck_grp_attr_t  vm_object_lck_grp_attr;
 471 lck_attr_t              vm_object_lck_attr;
 472 lck_attr_t              kernel_object_lck_attr;
 473
 474 /*
 475  *      vm_object_bootstrap:
 476  *
 477  *      Initialize the VM objects module.
 478  */
 479 __private_extern__ void
 480 vm_object_bootstrap(void)
 481 {
 482         register int    i;
 483
 484         vm_object_zone = zinit((vm_size_t) sizeof(struct vm_object),
 485                                 round_page(512*1024),
 486                                 round_page(12*1024),
 487                                 "vm objects");
 488         zone_change(vm_object_zone, Z_NOENCRYPT, TRUE);
 489
 490         vm_object_init_lck_grp();
 491
 492 #if VM_OBJECT_CACHE
 493         queue_init(&vm_object_cached_list);
 494
 495         lck_mtx_init_ext(&vm_object_cached_lock_data,
 496                 &vm_object_cached_lock_data_ext,
 497                 &vm_object_lck_grp,
 498                 &vm_object_lck_attr);
 499 #endif
 500         queue_init(&vm_object_reaper_queue);
 501
 502         for (i = 0; i < VM_OBJECT_HASH_LOCK_COUNT; i++) {
 503                 lck_mtx_init_ext(&vm_object_hashed_lock_data[i],
 504                                  &vm_object_hashed_lock_data_ext[i],
 505                                  &vm_object_lck_grp,
 506                                  &vm_object_lck_attr);
 507         }
 508         lck_mtx_init_ext(&vm_object_reaper_lock_data,
 509                 &vm_object_reaper_lock_data_ext,
 510                 &vm_object_lck_grp,
 511                 &vm_object_lck_attr);
 512
 513         vm_object_hash_zone =
 514                         zinit((vm_size_t) sizeof (struct vm_object_hash_entry),
 515                               round_page(512*1024),
 516                               round_page(12*1024),
 517                               "vm object hash entries");
 518         zone_change(vm_object_hash_zone, Z_NOENCRYPT, TRUE);
 519
 520         for (i = 0; i < VM_OBJECT_HASH_COUNT; i++)
 521                 queue_init(&vm_object_hashtable[i]);
 522
 523
 524         /*
 525          *      Fill in a template object, for quick initialization
 526          */
 527
 528         /* memq; Lock; init after allocation */
 529         vm_object_template.memq.prev = NULL;
 530         vm_object_template.memq.next = NULL;
 531 #if 0
 532         /*
 533          * We can't call vm_object_lock_init() here because that will
 534          * allocate some memory and VM is not fully initialized yet.
 535          * The lock will be initialized for each allocated object in
 536          * _vm_object_allocate(), so we don't need to initialize it in
 537          * the vm_object_template.
 538          */
 539         vm_object_lock_init(&vm_object_template);
 540 #endif
 541         vm_object_template.size = 0;
 542         vm_object_template.memq_hint = VM_PAGE_NULL;
 543         vm_object_template.ref_count = 1;
 544 #if     TASK_SWAPPER
 545         vm_object_template.res_count = 1;
 546 #endif  /* TASK_SWAPPER */
 547         vm_object_template.resident_page_count = 0;
 548         vm_object_template.wired_page_count = 0;
 549         vm_object_template.reusable_page_count = 0;
 550         vm_object_template.copy = VM_OBJECT_NULL;
 551         vm_object_template.shadow = VM_OBJECT_NULL;
 552         vm_object_template.shadow_offset = (vm_object_offset_t) 0;
 553         vm_object_template.pager = MEMORY_OBJECT_NULL;
 554         vm_object_template.paging_offset = 0;
 555         vm_object_template.pager_control = MEMORY_OBJECT_CONTROL_NULL;
 556         vm_object_template.copy_strategy = MEMORY_OBJECT_COPY_SYMMETRIC;
 557         vm_object_template.paging_in_progress = 0;
 558         vm_object_template.activity_in_progress = 0;
 559
 560         /* Begin bitfields */
 561         vm_object_template.all_wanted = 0; /* all bits FALSE */
 562         vm_object_template.pager_created = FALSE;
 563         vm_object_template.pager_initialized = FALSE;
 564         vm_object_template.pager_ready = FALSE;
 565         vm_object_template.pager_trusted = FALSE;
 566         vm_object_template.can_persist = FALSE;
 567         vm_object_template.internal = TRUE;
 568         vm_object_template.temporary = TRUE;
 569         vm_object_template.private = FALSE;
 570         vm_object_template.pageout = FALSE;
 571         vm_object_template.alive = TRUE;
 572         vm_object_template.purgable = VM_PURGABLE_DENY;
 573         vm_object_template.shadowed = FALSE;
 574         vm_object_template.silent_overwrite = FALSE;
 575         vm_object_template.advisory_pageout = FALSE;
 576         vm_object_template.true_share = FALSE;
 577         vm_object_template.terminating = FALSE;
 578         vm_object_template.named = FALSE;
 579         vm_object_template.shadow_severed = FALSE;
 580         vm_object_template.phys_contiguous = FALSE;
 581         vm_object_template.nophyscache = FALSE;
 582         /* End bitfields */
 583
 584         vm_object_template.cached_list.prev = NULL;
 585         vm_object_template.cached_list.next = NULL;
 586         vm_object_template.msr_q.prev = NULL;
 587         vm_object_template.msr_q.next = NULL;
 588
 589         vm_object_template.last_alloc = (vm_object_offset_t) 0;
 590         vm_object_template.sequential = (vm_object_offset_t) 0;
 591         vm_object_template.pages_created = 0;
 592         vm_object_template.pages_used = 0;
 593
 594 #if     MACH_PAGEMAP
 595         vm_object_template.existence_map = VM_EXTERNAL_NULL;
 596 #endif  /* MACH_PAGEMAP */
 597         vm_object_template.cow_hint = ~(vm_offset_t)0;
 598 #if     MACH_ASSERT
 599         vm_object_template.paging_object = VM_OBJECT_NULL;
 600 #endif  /* MACH_ASSERT */
 601
 602         /* cache bitfields */
 603         vm_object_template.wimg_bits = VM_WIMG_DEFAULT;
 604         vm_object_template.code_signed = FALSE;
 605         vm_object_template.hashed = FALSE;
 606         vm_object_template.transposed = FALSE;
 607         vm_object_template.mapping_in_progress = FALSE;
 608         vm_object_template.volatile_empty = FALSE;
 609         vm_object_template.volatile_fault = FALSE;
 610         vm_object_template.all_reusable = FALSE;
 611         vm_object_template.blocked_access = FALSE;
 612         vm_object_template.__object2_unused_bits = 0;
 613 #if UPL_DEBUG
 614         vm_object_template.uplq.prev = NULL;
 615         vm_object_template.uplq.next = NULL;
 616 #endif /* UPL_DEBUG */
 617 #ifdef VM_PIP_DEBUG
 618         bzero(&vm_object_template.pip_holders,
 619               sizeof (vm_object_template.pip_holders));
 620 #endif /* VM_PIP_DEBUG */
 621
 622         vm_object_template.objq.next=NULL;
 623         vm_object_template.objq.prev=NULL;
 624
 625
 626         /*
 627          *      Initialize the "kernel object"
 628          */
 629
 630         kernel_object = &kernel_object_store;
 631
 632 /*
 633  *      Note that in the following size specifications, we need to add 1 because
 634  *      VM_MAX_KERNEL_ADDRESS (vm_last_addr) is a maximum address, not a size.
 635  */
 636
 637 #ifdef ppc
 638         _vm_object_allocate(vm_last_addr + 1,
 639                             kernel_object);
 640 #else
 641         _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1,
 642                             kernel_object);
 643 #endif
 644         kernel_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
 645
 646         /*
 647          *      Initialize the "submap object".  Make it as large as the
 648          *      kernel object so that no limit is imposed on submap sizes.
 649          */
 650
 651         vm_submap_object = &vm_submap_object_store;
 652 #ifdef ppc
 653         _vm_object_allocate(vm_last_addr + 1,
 654                             vm_submap_object);
 655 #else
 656         _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1,
 657                             vm_submap_object);
 658 #endif
 659         vm_submap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
 660
 661         /*
 662          * Create an "extra" reference to this object so that we never
 663          * try to deallocate it; zfree doesn't like to be called with
 664          * non-zone memory.
 665          */
 666         vm_object_reference(vm_submap_object);
 667
 668 #if     MACH_PAGEMAP
 669         vm_external_module_initialize();
 670 #endif  /* MACH_PAGEMAP */
 671 }
 672
 673 void
 674 vm_object_reaper_init(void)
 675 {
 676         kern_return_t   kr;
 677         thread_t        thread;
 678
 679         kr = kernel_thread_start_priority(
 680                 (thread_continue_t) vm_object_reaper_thread,
 681                 NULL,
 682                 BASEPRI_PREEMPT - 1,
 683                 &thread);
 684         if (kr != KERN_SUCCESS) {
 685                 panic("failed to launch vm_object_reaper_thread kr=0x%x", kr);
 686         }
 687         thread_deallocate(thread);
 688 }
 689
 690 __private_extern__ void
 691 vm_object_init(void)
 692 {
 693         /*
 694          *      Finish initializing the kernel object.
 695          */
 696 }
 697
 698
 699 __private_extern__ void
 700 vm_object_init_lck_grp(void)
 701 {
 702         /*
 703          * initialze the vm_object lock world
 704          */
 705         lck_grp_attr_setdefault(&vm_object_lck_grp_attr);
 706         lck_grp_init(&vm_object_lck_grp, "vm_object", &vm_object_lck_grp_attr);
 707         lck_attr_setdefault(&vm_object_lck_attr);
 708         lck_attr_setdefault(&kernel_object_lck_attr);
 709         lck_attr_cleardebug(&kernel_object_lck_attr);
 710 }
 711
 712 #if VM_OBJECT_CACHE
 713 #define MIGHT_NOT_CACHE_SHADOWS         1
 714 #if     MIGHT_NOT_CACHE_SHADOWS
 715 static int cache_shadows = TRUE;
 716 #endif  /* MIGHT_NOT_CACHE_SHADOWS */
 717 #endif
 718
 719 /*
 720  *      vm_object_deallocate:
 721  *
 722  *      Release a reference to the specified object,
 723  *      gained either through a vm_object_allocate
 724  *      or a vm_object_reference call.  When all references
 725  *      are gone, storage associated with this object
 726  *      may be relinquished.
 727  *
 728  *      No object may be locked.
 729  */
 730 unsigned long vm_object_deallocate_shared_successes = 0;
 731 unsigned long vm_object_deallocate_shared_failures = 0;
 732 unsigned long vm_object_deallocate_shared_swap_failures = 0;
 733 __private_extern__ void
 734 vm_object_deallocate(
 735         register vm_object_t    object)
 736 {
 737 #if VM_OBJECT_CACHE
 738         boolean_t       retry_cache_trim = FALSE;
 739         uint32_t        try_failed_count = 0;
 740 #endif
 741         vm_object_t     shadow = VM_OBJECT_NULL;
 742
 743 //      if(object)dbgLog(object, object->ref_count, object->can_persist, 3);    /* (TEST/DEBUG) */
 744 //      else dbgLog(object, 0, 0, 3);   /* (TEST/DEBUG) */
 745
 746         if (object == VM_OBJECT_NULL)
 747                 return;
 748
 749         if (object == kernel_object) {
 750                 vm_object_lock_shared(object);
 751
 752                 OSAddAtomic(-1, &object->ref_count);
 753
 754                 if (object->ref_count == 0) {
 755                         panic("vm_object_deallocate: losing kernel_object\n");
 756                 }
 757                 vm_object_unlock(object);
 758                 return;
 759         }
 760
 761         if (object->ref_count > 2 ||
 762             (!object->named && object->ref_count > 1)) {
 763                 UInt32          original_ref_count;
 764                 volatile UInt32 *ref_count_p;
 765                 Boolean         atomic_swap;
 766
 767                 /*
 768                  * The object currently looks like it is not being
 769                  * kept alive solely by the reference we're about to release.
 770                  * Let's try and release our reference without taking
 771                  * all the locks we would need if we had to terminate the
 772                  * object (cache lock + exclusive object lock).
 773                  * Lock the object "shared" to make sure we don't race with
 774                  * anyone holding it "exclusive".
 775                  */
 776                 vm_object_lock_shared(object);
 777                 ref_count_p = (volatile UInt32 *) &object->ref_count;
 778                 original_ref_count = object->ref_count;
 779                 /*
 780                  * Test again as "ref_count" could have changed.
 781                  * "named" shouldn't change.
 782                  */
 783                 if (original_ref_count > 2 ||
 784                     (!object->named && original_ref_count > 1)) {
 785                         atomic_swap = OSCompareAndSwap(
 786                                 original_ref_count,
 787                                 original_ref_count - 1,
 788                                 (UInt32 *) &object->ref_count);
 789                         if (atomic_swap == FALSE) {
 790                                 vm_object_deallocate_shared_swap_failures++;
 791                         }
 792
 793                 } else {
 794                         atomic_swap = FALSE;
 795                 }
 796                 vm_object_unlock(object);
 797
 798                 if (atomic_swap) {
 799                         /*
 800                          * ref_count was updated atomically !
 801                          */
 802                         vm_object_deallocate_shared_successes++;
 803                         return;
 804                 }
 805
 806                 /*
 807                  * Someone else updated the ref_count at the same
 808                  * time and we lost the race.  Fall back to the usual
 809                  * slow but safe path...
 810                  */
 811                 vm_object_deallocate_shared_failures++;
 812         }
 813
 814         while (object != VM_OBJECT_NULL) {
 815
 816                 vm_object_lock(object);
 817
 818                 assert(object->ref_count > 0);
 819
 820                 /*
 821                  *      If the object has a named reference, and only
 822                  *      that reference would remain, inform the pager
 823                  *      about the last "mapping" reference going away.
 824                  */
 825                 if ((object->ref_count == 2)  && (object->named)) {
 826                         memory_object_t pager = object->pager;
 827
 828                         /* Notify the Pager that there are no */
 829                         /* more mappers for this object */
 830
 831                         if (pager != MEMORY_OBJECT_NULL) {
 832                                 vm_object_mapping_wait(object, THREAD_UNINT);
 833                                 vm_object_mapping_begin(object);
 834                                 vm_object_unlock(object);
 835
 836                                 memory_object_last_unmap(pager);
 837
 838                                 vm_object_lock(object);
 839                                 vm_object_mapping_end(object);
 840                         }
 841                         /*
 842                          * recheck the ref_count since we dropped the object lock
 843                          * to call 'memory_object_last_unmap'... it's possible
 844                          * additional references got taken and we only want
 845                          * to deactivate the pages if this 'named' object will only
 846                          * referenced by the backing pager once we drop our reference
 847                          * below
 848                          * be referenced by the backing pager once we drop our reference
 849                         if (!object->terminating && object->ref_count == 2)
 850                                 vm_object_deactivate_all_pages(object);
 851
 852                         assert(object->ref_count > 0);
 853                 }
 854
 855                 /*
 856                  *      Lose the reference. If other references
 857                  *      remain, then we are done, unless we need
 858                  *      to retry a cache trim.
 859                  *      If it is the last reference, then keep it
 860                  *      until any pending initialization is completed.
 861                  */
 862
 863                 /* if the object is terminating, it cannot go into */
 864                 /* the cache and we obviously should not call      */
 865                 /* terminate again.  */
 866
 867                 if ((object->ref_count > 1) || object->terminating) {
 868                         vm_object_lock_assert_exclusive(object);
 869                         object->ref_count--;
 870                         vm_object_res_deallocate(object);
 871
 872                         if (object->ref_count == 1 &&
 873                             object->shadow != VM_OBJECT_NULL) {
 874                                 /*
 875                                  * There's only one reference left on this
 876                                  * VM object.  We can't tell if it's a valid
 877                                  * one (from a mapping for example) or if this
 878                                  * object is just part of a possibly stale and
 879                                  * useless shadow chain.
 880                                  * We would like to try and collapse it into
 881                                  * its parent, but we don't have any pointers
 882                                  * back to this parent object.
 883                                  * But we can try and collapse this object with
 884                                  * its own shadows, in case these are useless
 885                                  * too...
 886                                  * We can't bypass this object though, since we
 887                                  * don't know if this last reference on it is
 888                                  * meaningful or not.
 889                                  */
 890                                 vm_object_collapse(object, 0, FALSE);
 891                         }
 892                         vm_object_unlock(object);
 893 #if VM_OBJECT_CACHE
 894                         if (retry_cache_trim &&
 895                             ((object = vm_object_cache_trim(TRUE)) !=
 896                              VM_OBJECT_NULL)) {
 897                                 continue;
 898                         }
 899 #endif
 900                         return;
 901                 }
 902
 903                 /*
 904                  *      We have to wait for initialization
 905                  *      before destroying or caching the object.
 906                  */
 907
 908                 if (object->pager_created && ! object->pager_initialized) {
 909                         assert(! object->can_persist);
 910                         vm_object_assert_wait(object,
 911                                               VM_OBJECT_EVENT_INITIALIZED,
 912                                               THREAD_UNINT);
 913                         vm_object_unlock(object);
 914
 915                         thread_block(THREAD_CONTINUE_NULL);
 916                         continue;
 917                 }
 918
 919 #if VM_OBJECT_CACHE
 920                 /*
 921                  *      If this object can persist, then enter it in
 922                  *      the cache. Otherwise, terminate it.
 923                  *
 924                  *      NOTE:  Only permanent objects are cached, and
 925                  *      permanent objects cannot have shadows.  This
 926                  *      affects the residence counting logic in a minor
 927                  *      way (can do it in-line, mostly).
 928                  */
 929
 930                 if ((object->can_persist) && (object->alive)) {
 931                         /*
 932                          *      Now it is safe to decrement reference count,
 933                          *      and to return if reference count is > 0.
 934                          */
 935
 936                         vm_object_lock_assert_exclusive(object);
 937                         if (--object->ref_count > 0) {
 938                                 vm_object_res_deallocate(object);
 939                                 vm_object_unlock(object);
 940
 941                                 if (retry_cache_trim &&
 942                                     ((object = vm_object_cache_trim(TRUE)) !=
 943                                      VM_OBJECT_NULL)) {
 944                                         continue;
 945                                 }
 946                                 return;
 947                         }
 948
 949 #if     MIGHT_NOT_CACHE_SHADOWS
 950                         /*
 951                          *      Remove shadow now if we don't
 952                          *      want to cache shadows.
 953                          */
 954                         if (! cache_shadows) {
 955                                 shadow = object->shadow;
 956                                 object->shadow = VM_OBJECT_NULL;
 957                         }
 958 #endif  /* MIGHT_NOT_CACHE_SHADOWS */
 959
 960                         /*
 961                          *      Enter the object onto the queue of
 962                          *      cached objects, and deactivate
 963                          *      all of its pages.
 964                          */
 965                         assert(object->shadow == VM_OBJECT_NULL);
 966                         VM_OBJ_RES_DECR(object);
 967                         XPR(XPR_VM_OBJECT,
 968                       "vm_o_deallocate: adding %x to cache, queue = (%x, %x)\n",
 969                                 object,
 970                                 vm_object_cached_list.next,
 971                                 vm_object_cached_list.prev,0,0);
 972
 973
 974                         vm_object_unlock(object);
 975
 976                         try_failed_count = 0;
 977                         for (;;) {
 978                                 vm_object_cache_lock();
 979
 980                                 /*
 981                                  * if we try to take a regular lock here
 982                                  * we risk deadlocking against someone
 983                                  * holding a lock on this object while
 984                                  * trying to vm_object_deallocate a different
 985                                  * object
 986                                  */
 987                                 if (vm_object_lock_try(object))
 988                                         break;
 989                                 vm_object_cache_unlock();
 990                                 try_failed_count++;
 991
 992                                 mutex_pause(try_failed_count);  /* wait a bit */
 993                         }
 994                         vm_object_cached_count++;
 995                         if (vm_object_cached_count > vm_object_cached_high)
 996                                 vm_object_cached_high = vm_object_cached_count;
 997                         queue_enter(&vm_object_cached_list, object,
 998                                 vm_object_t, cached_list);
 999                         vm_object_cache_unlock();
1000
1001                         vm_object_deactivate_all_pages(object);
1002                         vm_object_unlock(object);
1003
1004 #if     MIGHT_NOT_CACHE_SHADOWS
1005                         /*
1006                          *      If we have a shadow that we need
1007                          *      to deallocate, do so now, remembering
1008                          *      to trim the cache later.
1009                          */
1010                         if (! cache_shadows && shadow != VM_OBJECT_NULL) {
1011                                 object = shadow;
1012                                 retry_cache_trim = TRUE;
1013                                 continue;
1014                         }
1015 #endif  /* MIGHT_NOT_CACHE_SHADOWS */
1016
1017                         /*
1018                          *      Trim the cache. If the cache trim
1019                          *      returns with a shadow for us to deallocate,
1020                          *      then remember to retry the cache trim
1021                          *      when we are done deallocating the shadow.
1022                          *      Otherwise, we are done.
1023                          */
1024
1025                         object = vm_object_cache_trim(TRUE);
1026                         if (object == VM_OBJECT_NULL) {
1027                                 return;
1028                         }
1029                         retry_cache_trim = TRUE;
1030                 } else
1031 #endif  /* VM_OBJECT_CACHE */
1032                 {
1033                         /*
1034                          *      This object is not cachable; terminate it.
1035                          */
1036                         XPR(XPR_VM_OBJECT,
1037          "vm_o_deallocate: !cacheable 0x%X res %d paging_ops %d thread 0x%p ref %d\n",
1038                             object, object->resident_page_count,
1039                             object->paging_in_progress,
1040                             (void *)current_thread(),object->ref_count);
1041
1042                         VM_OBJ_RES_DECR(object);        /* XXX ? */
1043                         /*
1044                          *      Terminate this object. If it had a shadow,
1045                          *      then deallocate it; otherwise, if we need
1046                          *      to retry a cache trim, do so now; otherwise,
1047                          *      we are done. "pageout" objects have a shadow,
1048                          *      but maintain a "paging reference" rather than
1049                          *      a normal reference.
1050                          */
1051                         shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
1052
1053                         if (vm_object_terminate(object) != KERN_SUCCESS) {
1054                                 return;
1055                         }
1056                         if (shadow != VM_OBJECT_NULL) {
1057                                 object = shadow;
1058                                 continue;
1059                         }
1060 #if VM_OBJECT_CACHE
1061                         if (retry_cache_trim &&
1062                             ((object = vm_object_cache_trim(TRUE)) !=
1063                              VM_OBJECT_NULL)) {
1064                                 continue;
1065                         }
1066 #endif
1067                         return;
1068                 }
1069         }
1070 #if VM_OBJECT_CACHE
1071         assert(! retry_cache_trim);
1072 #endif
1073 }
1074
1075
1076 #if VM_OBJECT_CACHE
1077 /*
1078  *      Check to see whether we really need to trim
1079  *      down the cache. If so, remove an object from
1080  *      the cache, terminate it, and repeat.
1081  *
1082  *      Called with, and returns with, cache lock unlocked.
      *
      *      called_from_vm_object_deallocate:
      *              When TRUE, the shadow of a successfully terminated
      *              object is returned to the caller rather than being
      *              passed to vm_object_deallocate() from here.
      *
      *      Returns:
      *              VM_OBJECT_NULL once vm_object_cached_count is no
      *              longer above vm_object_cached_max, or if the cached
      *              list turns out to be empty.
      *              Otherwise (only when called_from_vm_object_deallocate
      *              is TRUE) the shadow object of the object that was
      *              just terminated.
1083  */
1084 vm_object_t
1085 vm_object_cache_trim(
1086         boolean_t called_from_vm_object_deallocate)
1087 {
1088         register vm_object_t object = VM_OBJECT_NULL;
1089         vm_object_t shadow;
1090
1091         for (;;) {
1092
1093                 /*
1094                  *      If we no longer need to trim the cache,
1095                  *      then we are done.
                      *
                      *      This first test is made without the cache lock;
                      *      the same test is repeated below once the lock
                      *      is held.
1096                  */
1097                 if (vm_object_cached_count <= vm_object_cached_max)
1098                         return VM_OBJECT_NULL;
1099
1100                 vm_object_cache_lock();
1101                 if (vm_object_cached_count <= vm_object_cached_max) {
1102                         vm_object_cache_unlock();
1103                         return VM_OBJECT_NULL;
1104                 }
1105
1106                 /*
1107                  *      We must trim down the cache, so remove
1108                  *      the first object in the cache.
1109                  */
1110                 XPR(XPR_VM_OBJECT,
1111                 "vm_object_cache_trim: removing from front of cache (%x, %x)\n",
1112                         vm_object_cached_list.next,
1113                         vm_object_cached_list.prev, 0, 0, 0);
1114
1115                 object = (vm_object_t) queue_first(&vm_object_cached_list);
1116                 if(object == (vm_object_t) &vm_object_cached_list) {
1117                         /* something's wrong with the calling parameter or */
1118                         /* the value of vm_object_cached_count, just fix   */
1119                         /* and return */
1120                         if(vm_object_cached_max < 0)
1121                                 vm_object_cached_max = 0;
1122                         vm_object_cached_count = 0;
1123                         vm_object_cache_unlock();
1124                         return VM_OBJECT_NULL;
1125                 }
                     /*
                      *      The object lock is taken while the cache lock is
                      *      still held; the cache lock is dropped only after
                      *      the object is off the list and the count adjusted.
                      */
1126                 vm_object_lock(object);
1127                 queue_remove(&vm_object_cached_list, object, vm_object_t,
1128                              cached_list);
1129                 vm_object_cached_count--;
1130
1131                 vm_object_cache_unlock();
1132                 /*
1133                  *      Since this object is in the cache, we know
1134                  *      that it is initialized and has no references.
1135                  *      Take a reference to avoid recursive deallocations.
1136                  */
1137
1138                 assert(object->pager_initialized);
1139                 assert(object->ref_count == 0);
1140                 vm_object_lock_assert_exclusive(object);
1141                 object->ref_count++;
1142
1143                 /*
1144                  *      Terminate the object.
1145                  *      If the object had a shadow, we let vm_object_deallocate
1146                  *      deallocate it. "pageout" objects have a shadow, but
1147                  *      maintain a "paging reference" rather than a normal
1148                  *      reference.
1149                  *      (We are careful here to limit recursion.)
1150                  */
1151                 shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
1152
                     /*
                      *      vm_object_terminate() is entered with the object
                      *      still locked and holding the one reference taken
                      *      above.  On anything but KERN_SUCCESS the saved
                      *      "shadow" is left untouched and we go around the
                      *      loop again.
                      */
1153                 if(vm_object_terminate(object) != KERN_SUCCESS)
1154                         continue;
1155
1156                 if (shadow != VM_OBJECT_NULL) {
1157                         if (called_from_vm_object_deallocate) {
                                     /* hand the shadow back instead of recursing */
1158                                 return shadow;
1159                         } else {
1160                                 vm_object_deallocate(shadow);
1161                         }
1162                 }
1163         }
1164 }
1165 #endif
1166
1167
1168 /*
1169  *      Routine:        vm_object_terminate
1170  *      Purpose:
1171  *              Free all resources associated with a vm_object.
1172  *      In/out conditions:
1173  *              Upon entry, the object must be locked,
1174  *              and the object must have exactly one reference.
1175  *
1176  *              The shadow object reference is left alone.
1177  *
1178  *              The object must be unlocked if it is found that pages
1179  *              must be flushed to a backing object.  If someone
1180  *              manages to map the object while it is being flushed
1181  *              the object is returned unlocked and unchanged.  Otherwise,
1182  *              upon exit, the cache will be unlocked, and the
1183  *              object will cease to exist.
      *      Returns:
      *              KERN_SUCCESS    vm_object_reap() was called on the
      *                              object.  The caller still has to drop
      *                              the reference the object held on its
      *                              shadow, if it had one.
      *              KERN_FAILURE    the object was already "terminating",
      *                              or its ref_count was not 1 after the
      *                              pages were reaped, or the rest of the
      *                              termination was handed over to
      *                              vm_object_reap_async().  The object is
      *                              unlocked before returning in all three
      *                              cases; in the first two its ref_count
      *                              has been decremented by one.
1184  */
1185 static kern_return_t
1186 vm_object_terminate(
1187         vm_object_t     object)
1188 {
1189         vm_object_t     shadow_object;
1190
1191         XPR(XPR_VM_OBJECT, "vm_object_terminate, object 0x%X ref %d\n",
1192                 object, object->ref_count, 0, 0, 0);
1193
             /*
              * Pages are reaped up front (REAP_TERMINATE) only for objects
              * that are not "pageout", are either not temporary or can
              * persist, and have a pager or a severed shadow.
              */
1194         if (!object->pageout && (!object->temporary || object->can_persist) &&
1195             (object->pager != NULL || object->shadow_severed)) {
1196                 /*
1197                  * Clear pager_trusted bit so that the pages get yanked
1198                  * out of the object instead of cleaned in place.  This
1199                  * prevents a deadlock in XMM and makes more sense anyway.
1200                  */
1201                 object->pager_trusted = FALSE;
1202
1203                 vm_object_reap_pages(object, REAP_TERMINATE);
1204         }
1205         /*
1206          *      Make sure the object isn't already being terminated
              *      If it is, drop one reference, unlock and report failure.
1207          */
1208         if (object->terminating) {
1209                 vm_object_lock_assert_exclusive(object);
1210                 object->ref_count--;
1211                 assert(object->ref_count > 0);
1212                 vm_object_unlock(object);
1213                 return KERN_FAILURE;
1214         }
1215
1216         /*
1217          * Did somebody get a reference to the object while we were
1218          * cleaning it?
              * If so, drop one reference, call vm_object_res_deallocate(),
              * unlock and report failure; the object keeps at least one
              * reference (asserted below).
1219          */
1220         if (object->ref_count != 1) {
1221                 vm_object_lock_assert_exclusive(object);
1222                 object->ref_count--;
1223                 assert(object->ref_count > 0);
1224                 vm_object_res_deallocate(object);
1225                 vm_object_unlock(object);
1226                 return KERN_FAILURE;
1227         }
1228
1229         /*
1230          *      Make sure no one can look us up now.
1231          */
1232
1233         object->terminating = TRUE;
1234         object->alive = FALSE;
1235
1236         if (object->hashed) {
1237                 lck_mtx_t       *lck;
1238
                     /*
                      * vm_object_hash_lock_spin() is handed the pager and
                      * returns the lock that is held across vm_object_remove().
                      */
1239                 lck = vm_object_hash_lock_spin(object->pager);
1240                 vm_object_remove(object);
1241                 vm_object_hash_unlock(lck);
1242         }
1243         /*
1244          *      Detach the object from its shadow if we are the shadow's
1245          *      copy. The reference we hold on the shadow must be dropped
1246          *      by our caller.
1247          */
1248         if (((shadow_object = object->shadow) != VM_OBJECT_NULL) &&
1249             !(object->pageout)) {
1250                 vm_object_lock(shadow_object);
1251                 if (shadow_object->copy == object)
1252                         shadow_object->copy = VM_OBJECT_NULL;
1253                 vm_object_unlock(shadow_object);
1254         }
1255
1256         if (object->paging_in_progress != 0 ||
1257             object->activity_in_progress != 0) {
1258                 /*
1259                  * There are still some paging_in_progress references
1260                  * on this object, meaning that there are some paging
1261                  * or other I/O operations in progress for this VM object.
1262                  * Such operations take some paging_in_progress references
1263                  * up front to ensure that the object doesn't go away, but
1264                  * they may also need to acquire a reference on the VM object,
1265                  * to map it in kernel space, for example.  That means that
1266                  * they may end up releasing the last reference on the VM
1267                  * object, triggering its termination, while still holding
1268                  * paging_in_progress references.  Waiting for these
1269                  * pending paging_in_progress references to go away here would
1270                  * deadlock.
1271                  *
1272                  * To avoid deadlocking, we'll let the vm_object_reaper_thread
1273                  * complete the VM object termination if it still holds
1274                  * paging_in_progress references at this point.
1275                  *
1276                  * No new paging_in_progress should appear now that the
1277                  * VM object is "terminating" and not "alive".
1278                  */
1279                 vm_object_reap_async(object);
1280                 vm_object_unlock(object);
1281                 /*
1282                  * Return KERN_FAILURE to let the caller know that we
1283                  * haven't completed the termination and it can't drop this
1284                  * object's reference on its shadow object yet.
1285                  * The reaper thread will take care of that once it has
1286                  * completed this object's termination.
1287                  */
1288                 return KERN_FAILURE;
1289         }
1290         /*
1291          * complete the VM object termination
1292          */
1293         vm_object_reap(object);
             /* the local pointer is cleared; it is not used again below */
1294         object = VM_OBJECT_NULL;
1295
1296         /*
1297          * the object lock was released by vm_object_reap()
1298          *
1299          * KERN_SUCCESS means that this object has been terminated
1300          * and no longer needs its shadow object but still holds a
1301          * reference on it.
1302          * The caller is responsible for dropping that reference.
1303          * We can't call vm_object_deallocate() here because that
1304          * would create a recursion.
1305          */
1306         return KERN_SUCCESS;
1307 }
1308
1309
1310 /*
1311  * vm_object_reap():
1312  *
1313  * Complete the termination of a VM object after it's been marked
1314  * as "terminating" and "!alive" by vm_object_terminate().
1315  *
1316  * The VM object must be locked by caller.
1317  * The lock will be released on return and the VM object is no longer valid.
      *
      * In order, this routine:
      *   - asserts that no paging or activity is in progress,
      *   - detaches the pager pointer and disables the pager control,
      *   - drops the last reference (ref_count is asserted to reach 0),
      *   - takes the object off the purgeable queue if it is on one,
      *   - disposes of the object's pages,
      *   - releases the pager, with the object lock dropped around the call,
      *   - clears "terminating" and does a paging begin/end pair,
      *   - destroys the object lock and frees the object to vm_object_zone.
1318  */
1319 void
1320 vm_object_reap(
1321         vm_object_t object)
1322 {
1323         memory_object_t         pager;
1324
1325         vm_object_lock_assert_exclusive(object);
1326         assert(object->paging_in_progress == 0);
1327         assert(object->activity_in_progress == 0);
1328
1329         vm_object_reap_count++;
1330
             /*
              * Keep the pager in a local and clear the object's pointer;
              * the local copy is what gets released further down.
              */
1331         pager = object->pager;
1332         object->pager = MEMORY_OBJECT_NULL;
1333
1334         if (pager != MEMORY_OBJECT_NULL)
1335                 memory_object_control_disable(object->pager_control);
1336
             /* drop the last reference; the count must now be zero */
1337         object->ref_count--;
1338 #if     TASK_SWAPPER
1339         assert(object->res_count == 0);
1340 #endif  /* TASK_SWAPPER */
1341
1342         assert (object->ref_count == 0);
1343
1344         /*
1345          * remove from purgeable queue if it's on
              * (a non-NULL objq.next or objq.prev is what is tested)
1346          */
1347         if (object->objq.next || object->objq.prev) {
1348                 purgeable_q_t queue = vm_purgeable_object_remove(object);
1349                 assert(queue);
1350
1351                 /* Must take page lock for this - using it to protect token queue */
1352                 vm_page_lock_queues();
1353                 vm_purgeable_token_delete_first(queue);
1354
1355                 assert(queue->debug_count_objects>=0);
1356                 vm_page_unlock_queues();
1357         }
1358
1359         /*
1360          *      Clean or free the pages, as appropriate.
1361          *      It is possible for us to find busy/absent pages,
1362          *      if some faults on this object were aborted.
              *
              *      "pageout" objects go through vm_pageout_object_terminate();
              *      otherwise pages are reaped here (REAP_REAP) when the object
              *      is temporary and cannot persist, or when it had no pager.
              *      Either way the page queue is asserted empty afterwards.
1363          */
1364         if (object->pageout) {
1365                 assert(object->shadow != VM_OBJECT_NULL);
1366
1367                 vm_pageout_object_terminate(object);
1368
1369         } else if (((object->temporary && !object->can_persist) || (pager == MEMORY_OBJECT_NULL))) {
1370
1371                 vm_object_reap_pages(object, REAP_REAP);
1372         }
1373         assert(queue_empty(&object->memq));
1374         assert(object->paging_in_progress == 0);
1375         assert(object->activity_in_progress == 0);
1376         assert(object->ref_count == 0);
1377
1378         /*
1379          * If the pager has not already been released by
1380          * vm_object_destroy, we need to terminate it and
1381          * release our reference to it here.
              * The object lock is dropped around the call and retaken
              * afterwards.
1382          */
1383         if (pager != MEMORY_OBJECT_NULL) {
1384                 vm_object_unlock(object);
1385                 vm_object_release_pager(pager, object->hashed);
1386                 vm_object_lock(object);
1387         }
1388
1389         /* kick off anyone waiting on terminating */
             /*
              * NOTE(review): the begin/end pair presumably exists only for
              * the wakeup side effect of vm_object_paging_end() -- confirm
              * against its definition.
              */
1390         object->terminating = FALSE;
1391         vm_object_paging_begin(object);
1392         vm_object_paging_end(object);
1393         vm_object_unlock(object);
1394
             /*
              * The object lock is not held from here on; ref_count was
              * asserted to be 0 above.
              * NOTE(review): presumably nothing else can reach the object
              * at this point -- confirm.
              */
1395 #if     MACH_PAGEMAP
1396         vm_external_destroy(object->existence_map, object->size);
1397 #endif  /* MACH_PAGEMAP */
1398
1399         object->shadow = VM_OBJECT_NULL;
1400
1401         vm_object_lock_destroy(object);
1402         /*
1403          *      Free the space for the object.
1404          */
1405         zfree(vm_object_zone, object);
             /* only the local pointer is cleared here */
1406         object = VM_OBJECT_NULL;
1407 }
1408
1409
1410
/*
 * Batch size used by vm_object_reap_pages(): its loop counter is
 * initialized to V_O_R_MAX_BATCH + 1 and the page queues lock is
 * dropped each time the counter reaches zero.
 */
#define V_O_R_MAX_BATCH 128


/*
 * VM_OBJ_REAP_FREELIST(_local_free_q, do_disconnect)
 *
 * Release a locally collected chain of pages (linked through pageq.next).
 * Nothing happens if the chain is empty.  When "do_disconnect" is true,
 * every page in the chain that is marked "pmapped" is first passed to
 * pmap_disconnect().  The whole chain is then handed to
 * vm_page_free_list() and "_local_free_q" is reset to VM_PAGE_NULL, so
 * it must be an lvalue.
 */
#define VM_OBJ_REAP_FREELIST(_local_free_q, do_disconnect)              \
        MACRO_BEGIN                                                     \
        if (_local_free_q) {                                            \
                if (do_disconnect) {                                    \
                        vm_page_t _reap_pg = _local_free_q;             \
                        while (_reap_pg != VM_PAGE_NULL) {              \
                                if (_reap_pg->pmapped) {                \
                                        pmap_disconnect(_reap_pg->phys_page); \
                                }                                       \
                                _reap_pg = (vm_page_t) _reap_pg->pageq.next; \
                        }                                               \
                }                                                       \
                vm_page_free_list(_local_free_q, TRUE);                 \
                _local_free_q = VM_PAGE_NULL;                           \
        }                                                               \
        MACRO_END
1431
1432
1433 void
1434 vm_object_reap_pages(
1435         vm_object_t     object,
1436         int             reap_type)
1437 {
1438         vm_page_t       p;
1439         vm_page_t       next;
1440         vm_page_t       local_free_q = VM_PAGE_NULL;
1441         int             loop_count;
1442         boolean_t       disconnect_on_release;
1443
1444         if (reap_type == REAP_DATA_FLUSH) {
1445                 /*
1446                  * We need to disconnect pages from all pmaps before
1447                  * releasing them to the free list
1448                  */
1449                 disconnect_on_release = TRUE;
1450         } else {
1451                 /*
1452                  * Either the caller has already disconnected the pages
1453                  * from all pmaps, or we disconnect them here as we add
1454                  * them to our local list of pages to be released.
1455                  * No need to re-disconnect them when we release the pages
1456                  * to the free list.
1457                  */
1458                 disconnect_on_release = FALSE;
1459         }
1460
1461 restart_after_sleep:
1462         if (queue_empty(&object->memq))
1463                 return;
1464         loop_count = V_O_R_MAX_BATCH + 1;
1465
1466         vm_page_lockspin_queues();
1467
1468         next = (vm_page_t)queue_first(&object->memq);
1469
1470         while (!queue_end(&object->memq, (queue_entry_t)next)) {
1471
1472                 p = next;
1473                 next = (vm_page_t)queue_next(&next->listq);
1474
1475                 if (--loop_count == 0) {
1476
1477                         vm_page_unlock_queues();
1478
1479                         if (local_free_q) {
1480                                 /*
1481                                  * Free the pages we reclaimed so far
1482                                  * and take a little break to avoid
1483                                  * hogging the page queue lock too long
1484                                  */
1485                                 VM_OBJ_REAP_FREELIST(local_free_q,
1486                                                      disconnect_on_release);
1487                         } else
1488                                 mutex_pause(0);
1489
1490                         loop_count = V_O_R_MAX_BATCH + 1;
1491
1492                         vm_page_lockspin_queues();
1493                 }
1494                 if (reap_type == REAP_DATA_FLUSH || reap_type == REAP_TERMINATE) {
1495
1496                         if (reap_type == REAP_DATA_FLUSH &&
1497                             ((p->pageout == TRUE || p->cleaning == TRUE) && p->list_req_pending == TRUE)) {
1498                                 p->list_req_pending = FALSE;
1499                                 p->cleaning = FALSE;
1500                                 /*
1501                                  * need to drop the laundry count...
1502                                  * we may also need to remove it
1503                                  * from the I/O paging queue...
1504                                  * vm_pageout_throttle_up handles both cases
1505                                  *
1506                                  * the laundry and pageout_queue flags are cleared...
1507                                  */
1508 #if CONFIG_EMBEDDED
1509                                 if (p->laundry)
1510                                         vm_pageout_throttle_up(p);
1511 #else
1512                                 vm_pageout_throttle_up(p);
1513 #endif
1514                                 if (p->pageout == TRUE) {
1515                                         /*
1516                                          * toss the wire count we picked up
1517                                          * when we initially set this page up
1518                                          * to be cleaned and stolen...
1519                                          */
1520                                         vm_page_unwire(p, TRUE);
1521                                         p->pageout = FALSE;
1522                                 }
1523                                 PAGE_WAKEUP(p);
1524
1525                         } else if (p->busy || p->cleaning) {
1526
1527                                 vm_page_unlock_queues();
1528                                 /*
1529                                  * free the pages reclaimed so far
1530                                  */
1531                                 VM_OBJ_REAP_FREELIST(local_free_q,
1532                                                      disconnect_on_release);
1533
1534                                 PAGE_SLEEP(object, p, THREAD_UNINT);
1535
1536                                 goto restart_after_sleep;
1537                         }
1538                 }
1539                 switch (reap_type) {
1540
1541                 case REAP_DATA_FLUSH:
1542                         if (VM_PAGE_WIRED(p)) {
1543                                 /*
1544                                  * this is an odd case... perhaps we should
1545                                  * zero-fill this page since we're conceptually
1546                                  * tossing its data at this point, but leaving
1547                                  * it on the object to honor the 'wire' contract
1548                                  */
1549                                 continue;
1550                         }
1551                         break;
1552
1553                 case REAP_PURGEABLE:
1554                         if (VM_PAGE_WIRED(p)) {
1555                                 /* can't purge a wired page */
1556                                 vm_page_purged_wired++;
1557                                 continue;
1558                         }
1559
1560                         if (p->busy) {
1561                                 /*
1562                                  * We can't reclaim a busy page but we can
1563                                  * make it pageable (it's not wired) to make
1564                                  * sure that it gets considered by
1565                                  * vm_pageout_scan() later.
1566                                  */
1567                                 vm_page_deactivate(p);
1568                                 vm_page_purged_busy++;
1569                                 continue;
1570                         }
1571
1572                         if (p->cleaning || p->laundry || p->list_req_pending) {
1573                                 /*
1574                                  * page is being acted upon,
1575                                  * so don't mess with it
1576                                  */
1577                                 vm_page_purged_others++;
1578                                 continue;
1579                         }
1580                         assert(p->object != kernel_object);
1581
1582                         /*
1583                          * we can discard this page...
1584                          */
1585                         if (p->pmapped == TRUE) {
1586                                 int refmod_state;
1587                                 /*
1588                                  * unmap the page
1589                                  */
1590                                 refmod_state = pmap_disconnect(p->phys_page);
1591                                 if (refmod_state & VM_MEM_MODIFIED) {
1592                                         p->dirty = TRUE;
1593                                 }
1594                         }
1595                         if (p->dirty || p->precious) {
1596                                 /*
1597                                  * we saved the cost of cleaning this page !
1598                                  */
1599                                 vm_page_purged_count++;
1600                         }
1601
1602                         break;
1603
1604                 case REAP_TERMINATE:
1605                         if (p->absent || p->private) {
1606                                 /*
1607                                  *      For private pages, VM_PAGE_FREE just
1608                                  *      leaves the page structure around for
1609                                  *      its owner to clean up.  For absent
1610                                  *      pages, the structure is returned to
1611                                  *      the appropriate pool.
1612                                  */
1613                                 break;
1614                         }
1615                         if (p->fictitious) {
1616                                 assert (p->phys_page == vm_page_guard_addr);
1617                                 break;
1618                         }
1619                         if (!p->dirty && p->wpmapped)
1620                                 p->dirty = pmap_is_modified(p->phys_page);
1621
1622                         if ((p->dirty || p->precious) && !p->error && object->alive) {
1623
1624                                 p->busy = TRUE;
1625
1626                                 VM_PAGE_QUEUES_REMOVE(p);
1627
1628                                 vm_page_unlock_queues();
1629                                 /*
1630                                  * free the pages reclaimed so far
1631                                  */
1632                                 VM_OBJ_REAP_FREELIST(local_free_q,
1633                                                      disconnect_on_release);
1634
1635                                 /*
1636                                  * flush page... page will be freed
1637                                  * upon completion of I/O
1638                                  */
1639                                 vm_pageout_cluster(p);
1640                                 vm_object_paging_wait(object, THREAD_UNINT);
1641
1642                                 goto restart_after_sleep;
1643                         }
1644                         break;
1645
1646                 case REAP_REAP:
1647                         break;
1648                 }
1649                 vm_page_free_prepare_queues(p);
1650                 assert(p->pageq.next == NULL && p->pageq.prev == NULL);
1651                 /*
1652                  * Add this page to our list of reclaimed pages,
1653                  * to be freed later.
1654                  */
1655                 p->pageq.next = (queue_entry_t) local_free_q;
1656                 local_free_q = p;
1657         }
1658         vm_page_unlock_queues();
1659
1660         /*
1661          * Free the remaining reclaimed pages
1662          */
1663         VM_OBJ_REAP_FREELIST(local_free_q,
1664                              disconnect_on_release);
1665 }
1666
1667
1668 void
1669 vm_object_reap_async(
1670         vm_object_t     object)
1671 {
1672         vm_object_lock_assert_exclusive(object);
1673
1674         vm_object_reaper_lock_spin();
1675
1676         vm_object_reap_count_async++;
1677
1678         /* enqueue the VM object... */
1679         queue_enter(&vm_object_reaper_queue, object,
1680                     vm_object_t, cached_list);
1681
1682         vm_object_reaper_unlock();
1683
1684         /* ... and wake up the reaper thread */
1685         thread_wakeup((event_t) &vm_object_reaper_queue);
1686 }
1687
1688
1689 void
1690 vm_object_reaper_thread(void)
1691 {
1692         vm_object_t     object, shadow_object;
1693
1694         vm_object_reaper_lock_spin();
1695
1696         while (!queue_empty(&vm_object_reaper_queue)) {
1697                 queue_remove_first(&vm_object_reaper_queue,
1698                                    object,
1699                                    vm_object_t,
1700                                    cached_list);
1701
1702                 vm_object_reaper_unlock();
1703                 vm_object_lock(object);
1704
1705                 assert(object->terminating);
1706                 assert(!object->alive);
1707
1708                 /*
1709                  * The pageout daemon might be playing with our pages.
1710                  * Now that the object is dead, it won't touch any more
1711                  * pages, but some pages might already be on their way out.
1712                  * Hence, we wait until the active paging activities have
1713                  * ceased before we break the association with the pager
1714                  * itself.
1715                  */
1716                 while (object->paging_in_progress != 0 ||
1717                         object->activity_in_progress != 0) {
1718                         vm_object_wait(object,
1719                                        VM_OBJECT_EVENT_PAGING_IN_PROGRESS,
1720                                        THREAD_UNINT);
1721                         vm_object_lock(object);
1722                 }
1723
1724                 shadow_object =
1725                         object->pageout ? VM_OBJECT_NULL : object->shadow;
1726
1727                 vm_object_reap(object);
1728                 /* cache is unlocked and object is no longer valid */
1729                 object = VM_OBJECT_NULL;
1730
1731                 if (shadow_object != VM_OBJECT_NULL) {
1732                         /*
1733                          * Drop the reference "object" was holding on
1734                          * its shadow object.
1735                          */
1736                         vm_object_deallocate(shadow_object);
1737                         shadow_object = VM_OBJECT_NULL;
1738                 }
1739                 vm_object_reaper_lock_spin();
1740         }
1741
1742         /* wait for more work... */
1743         assert_wait((event_t) &vm_object_reaper_queue, THREAD_UNINT);
1744
1745         vm_object_reaper_unlock();
1746
1747         thread_block((thread_continue_t) vm_object_reaper_thread);
1748         /*NOTREACHED*/
1749 }
1750
1751 /*
1752  *      Routine:        vm_object_pager_wakeup
1753  *      Purpose:        Wake up anyone waiting for termination of a pager.
1754  */
1755
1756 static void
1757 vm_object_pager_wakeup(
1758         memory_object_t pager)
1759 {
1760         vm_object_hash_entry_t  entry;
1761         boolean_t               waiting = FALSE;
1762         lck_mtx_t               *lck;
1763
1764         /*
1765          *      If anyone was waiting for the memory_object_terminate
1766          *      to be queued, wake them up now.
1767          */
1768         lck = vm_object_hash_lock_spin(pager);
1769         entry = vm_object_hash_lookup(pager, TRUE);
1770         if (entry != VM_OBJECT_HASH_ENTRY_NULL)
1771                 waiting = entry->waiting;
1772         vm_object_hash_unlock(lck);
1773
1774         if (entry != VM_OBJECT_HASH_ENTRY_NULL) {
1775                 if (waiting)
1776                         thread_wakeup((event_t) pager);
1777                 vm_object_hash_entry_free(entry);
1778         }
1779 }
1780
1781 /*
1782  *      Routine:        vm_object_release_pager
1783  *      Purpose:        Terminate the pager and, upon completion,
1784  *                      release our last reference to it.
1785  *                      just like memory_object_terminate, except
1786  *                      that we wake up anyone blocked in vm_object_enter
1787  *                      waiting for termination message to be queued
1788  *                      before calling memory_object_init.
1789  */
1790 static void
1791 vm_object_release_pager(
1792         memory_object_t pager,
1793         boolean_t       hashed)
1794 {
1795
1796         /*
1797          *      Terminate the pager.
1798          */
1799
1800         (void) memory_object_terminate(pager);
1801
1802         if (hashed == TRUE) {
1803                 /*
1804                  *      Wakeup anyone waiting for this terminate
1805                  *      and remove the entry from the hash
1806                  */
1807                 vm_object_pager_wakeup(pager);
1808         }
1809         /*
1810          *      Release reference to pager.
1811          */
1812         memory_object_deallocate(pager);
1813 }
1814
1815 /*
1816  *      Routine:        vm_object_destroy
1817  *      Purpose:
1818  *              Shut down a VM object, despite the
1819  *              presence of address map (or other) references
1820  *              to the vm_object.
1821  */
1822 kern_return_t
1823 vm_object_destroy(
1824         vm_object_t             object,
1825         __unused kern_return_t          reason)
1826 {
1827         memory_object_t         old_pager;
1828
1829         if (object == VM_OBJECT_NULL)
1830                 return(KERN_SUCCESS);
1831
1832         /*
1833          *      Remove the pager association immediately.
1834          *
1835          *      This will prevent the memory manager from further
1836          *      meddling.  [If it wanted to flush data or make
1837          *      other changes, it should have done so before performing
1838          *      the destroy call.]
1839          */
1840
1841         vm_object_lock(object);
1842         object->can_persist = FALSE;
1843         object->named = FALSE;
1844         object->alive = FALSE;
1845
1846         if (object->hashed) {
1847                 lck_mtx_t       *lck;
1848                 /*
1849                  *      Rip out the pager from the vm_object now...
1850                  */
1851                 lck = vm_object_hash_lock_spin(object->pager);
1852                 vm_object_remove(object);
1853                 vm_object_hash_unlock(lck);
1854         }
1855         old_pager = object->pager;
1856         object->pager = MEMORY_OBJECT_NULL;
1857         if (old_pager != MEMORY_OBJECT_NULL)
1858                 memory_object_control_disable(object->pager_control);
1859
1860         /*
1861          * Wait for the existing paging activity (that got
1862          * through before we nulled out the pager) to subside.
1863          */
1864
1865         vm_object_paging_wait(object, THREAD_UNINT);
1866         vm_object_unlock(object);
1867
1868         /*
1869          *      Terminate the object now.
1870          */
1871         if (old_pager != MEMORY_OBJECT_NULL) {
1872                 vm_object_release_pager(old_pager, object->hashed);
1873
1874                 /*
1875                  * JMM - Release the caller's reference.  This assumes the
1876                  * caller had a reference to release, which is a big (but
1877                  * currently valid) assumption if this is driven from the
1878                  * vnode pager (it is holding a named reference when making
1879                  * this call)..
1880                  */
1881                 vm_object_deallocate(object);
1882
1883         }
1884         return(KERN_SUCCESS);
1885 }
1886
1887
1888 #define VM_OBJ_DEACT_ALL_STATS DEBUG
1889 #if VM_OBJ_DEACT_ALL_STATS
1890 uint32_t vm_object_deactivate_all_pages_batches = 0;
1891 uint32_t vm_object_deactivate_all_pages_pages = 0;
1892 #endif /* VM_OBJ_DEACT_ALL_STATS */
1893 /*
1894  *      vm_object_deactivate_all_pages
1895  *
1896  *      Deactivate all pages in the specified object.  (Keep its pages
1897  *      in memory even though it is no longer referenced.)
1898  *
1899  *      The object must be locked.
1900  */
1901 static void
1902 vm_object_deactivate_all_pages(
1903         register vm_object_t    object)
1904 {
1905         register vm_page_t      p;
1906         int                     loop_count;
1907 #if VM_OBJ_DEACT_ALL_STATS
1908         int                     pages_count;
1909 #endif /* VM_OBJ_DEACT_ALL_STATS */
1910 #define V_O_D_A_P_MAX_BATCH     256
1911
1912         loop_count = V_O_D_A_P_MAX_BATCH;
1913 #if VM_OBJ_DEACT_ALL_STATS
1914         pages_count = 0;
1915 #endif /* VM_OBJ_DEACT_ALL_STATS */
1916         vm_page_lock_queues();
1917         queue_iterate(&object->memq, p, vm_page_t, listq) {
1918                 if (--loop_count == 0) {
1919 #if VM_OBJ_DEACT_ALL_STATS
1920                         hw_atomic_add(&vm_object_deactivate_all_pages_batches,
1921                                       1);
1922                         hw_atomic_add(&vm_object_deactivate_all_pages_pages,
1923                                       pages_count);
1924                         pages_count = 0;
1925 #endif /* VM_OBJ_DEACT_ALL_STATS */
1926                         lck_mtx_yield(&vm_page_queue_lock);
1927                         loop_count = V_O_D_A_P_MAX_BATCH;
1928                 }
1929                 if (!p->busy && !p->throttled) {
1930 #if VM_OBJ_DEACT_ALL_STATS
1931                         pages_count++;
1932 #endif /* VM_OBJ_DEACT_ALL_STATS */
1933                         vm_page_deactivate(p);
1934                 }
1935         }
1936 #if VM_OBJ_DEACT_ALL_STATS
1937         if (pages_count) {
1938                 hw_atomic_add(&vm_object_deactivate_all_pages_batches, 1);
1939                 hw_atomic_add(&vm_object_deactivate_all_pages_pages,
1940                               pages_count);
1941                 pages_count = 0;
1942         }
1943 #endif /* VM_OBJ_DEACT_ALL_STATS */
1944         vm_page_unlock_queues();
1945 }
1946
1947
1948
1949 /*
1950  * when deallocating pages it is necessary to hold
1951  * the vm_page_queue_lock (a hot global lock) for certain operations
1952  * on the page... however, the majority of the work can be done
1953  * while merely holding the object lock... to mitigate the time spent behind the
1954  * global lock, go to a 2 pass algorithm... collect pages up to DELAYED_WORK_LIMIT
1955  * while doing all of the work that doesn't require the vm_page_queue_lock...
1956  * them call dw_do_work to acquire the vm_page_queue_lock and do the
1957  * necessary work for each page... we will grab the busy bit on the page
1958  * so that dw_do_work can drop the object lock if it can't immediately take the
1959  * vm_page_queue_lock in order to compete for the locks in the same order that
1960  * vm_pageout_scan takes them.
1961  */
1962
1963 #define DELAYED_WORK_LIMIT      32
1964
1965 #define DW_clear_reference      0x01
1966 #define DW_move_page            0x02
1967 #define DW_clear_busy           0x04
1968 #define DW_PAGE_WAKEUP          0x08
1969
1970
1971 struct dw {
1972         vm_page_t       dw_m;
1973         int             dw_mask;
1974 };
1975
1976 static void dw_do_work(vm_object_t object, struct dw *dwp, int dw_count);
1977
1978
1979 static void
1980 dw_do_work(
1981         vm_object_t     object,
1982         struct dw       *dwp,
1983         int             dw_count)
1984 {
1985         vm_page_t       m;
1986         int             j;
1987
1988         /*
1989          * pageout_scan takes the vm_page_lock_queues first
1990          * then tries for the object lock... to avoid what
1991          * is effectively a lock inversion, we'll go to the
1992          * trouble of taking them in that same order... otherwise
1993          * if this object contains the majority of the pages resident
1994          * in the UBC (or a small set of large objects actively being
1995          * worked on contain the majority of the pages), we could
1996          * cause the pageout_scan thread to 'starve' in its attempt
1997          * to find pages to move to the free queue, since it has to
1998          * successfully acquire the object lock of any candidate page
1999          * before it can steal/clean it.
2000          */
2001         if (!vm_page_trylockspin_queues()) {
2002                 vm_object_unlock(object);
2003
2004                 vm_page_lockspin_queues();
2005
2006                 for (j = 0; ; j++) {
2007                         if (!vm_object_lock_avoid(object) &&
2008                             _vm_object_lock_try(object))
2009                                 break;
2010                         vm_page_unlock_queues();
2011                         mutex_pause(j);
2012                         vm_page_lockspin_queues();
2013                 }
2014         }
2015         for (j = 0; j < dw_count; j++, dwp++) {
2016
2017                 m = dwp->dw_m;
2018
2019                 if (dwp->dw_mask & DW_clear_reference)
2020                         m->reference = FALSE;
2021
2022                 if (dwp->dw_mask & DW_move_page) {
2023                         VM_PAGE_QUEUES_REMOVE(m);
2024
2025                         assert(!m->laundry);
2026                         assert(m->object != kernel_object);
2027                         assert(m->pageq.next == NULL &&
2028                                m->pageq.prev == NULL);
2029
2030                         if (m->zero_fill) {
2031                                 queue_enter_first(&vm_page_queue_zf, m, vm_page_t, pageq);
2032                                 vm_zf_queue_count++;
2033                         } else {
2034                                 queue_enter_first(&vm_page_queue_inactive, m, vm_page_t, pageq);
2035                         }
2036                         m->inactive = TRUE;
2037
2038                         if (!m->fictitious) {
2039                                 vm_page_inactive_count++;
2040                                 token_new_pagecount++;
2041                         } else {
2042                                 assert(m->phys_page == vm_page_fictitious_addr);
2043                         }
2044                 }
2045                 if (dwp->dw_mask & DW_clear_busy)
2046                         dwp->dw_m->busy = FALSE;
2047
2048                 if (dwp->dw_mask & DW_PAGE_WAKEUP)
2049                         PAGE_WAKEUP(dwp->dw_m);
2050         }
2051         vm_page_unlock_queues();
2052
2053 #if CONFIG_EMBEDDED
2054         {
2055         int percent_avail;
2056
2057         /*
2058          * Decide if we need to send a memory status notification.
2059          */
2060         percent_avail =
2061                 (vm_page_active_count + vm_page_inactive_count +
2062                  vm_page_speculative_count + vm_page_free_count +
2063                  (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 /
2064                 atop_64(max_mem);
2065         if (percent_avail >= (kern_memorystatus_level + 5) ||
2066             percent_avail <= (kern_memorystatus_level - 5)) {
2067                 kern_memorystatus_level = percent_avail;
2068                 thread_wakeup((event_t)&kern_memorystatus_wakeup);
2069         }
2070         }
2071 #endif
2072 }
2073
2074
2075
/*
 * The "chunk" macros are used by routines below when looking for pages to deactivate.  These
 * exist because of the need to handle shadow chains.  When deactivating pages, we only
 * want to deactivate the ones at the top most level in the object chain.  In order to do
 * this efficiently, the specified address range is divided up into "chunks" and we use
 * a bit map to keep track of which pages have already been processed as we descend down
 * the shadow chain.  These chunk macros hide the details of the bit map implementation
 * as much as we can.
 *
 * For convenience, we use a 64-bit data type as the bit map, and therefore a chunk is
 * set to 64 pages.  The bit map is indexed from the low-order end, so that the lowest
 * order bit represents page 0 in the current range and highest order bit represents
 * page 63.
 *
 * For further convenience, we also use negative logic for the page state in the bit map.
 * The bit is set to 1 to indicate it has not yet been seen, and to 0 to indicate it has
 * been processed.  This way we can simply test the 64-bit long word to see if it's zero
 * to easily tell if the whole range has been processed.  Therefore, the bit map starts
 * out with all the bits set.  The macros below hide all these details from the caller.
 *
 * All single-bit masks are built from an unsigned constant (1ULL): shifting a signed 1
 * into bit 63 would be undefined behavior.  Page indexes passed as 'p' must be in the
 * range 0 .. PAGES_IN_A_CHUNK - 1.
 */

#define PAGES_IN_A_CHUNK        64      /* The number of pages in the chunk must */
                                        /* be the same as the number of bits in  */
                                        /* the chunk_state_t type. We use 64     */
                                        /* just for convenience.                 */

#define CHUNK_SIZE      (PAGES_IN_A_CHUNK * PAGE_SIZE_64)       /* Size of a chunk in bytes */

typedef uint64_t        chunk_state_t;

/*
 * The bit map uses negative logic, so we start out with all 64 bits set to indicate
 * that no pages have been processed yet.  Also, if len is less than the full CHUNK_SIZE,
 * then we mark pages beyond the len as having been "processed" so that we don't waste time
 * looking at pages in that range.  This can save us from unnecessarily chasing down the
 * shadow chain.
 *
 * 'c' is the chunk_state_t lvalue to initialize, 'len' the length in bytes of the range.
 * The scratch index has a macro-specific name so that it cannot capture an identifier
 * used in the caller's 'c' or 'len' expressions.
 */

#define CHUNK_INIT(c, len)                                              \
        MACRO_BEGIN                                                     \
        uint64_t chunk_init_pg_;                                        \
                                                                        \
        (c) = ~(chunk_state_t)0;                                        \
                                                                        \
        for (chunk_init_pg_ = (len) / PAGE_SIZE_64;                     \
             chunk_init_pg_ < PAGES_IN_A_CHUNK;                         \
             chunk_init_pg_++)                                          \
                MARK_PAGE_HANDLED(c, chunk_init_pg_);                   \
        MACRO_END

/*
 * Return true if at least one page in the chunk has not yet been processed.
 */

#define CHUNK_NOT_COMPLETE(c)   ((c) != 0)

/*
 * Return true if the page at offset 'p' in the bit map has already been handled
 * while processing a higher level object in the shadow chain.
 */

#define PAGE_ALREADY_HANDLED(c, p)      (((c) & (1ULL << (p))) == 0)

/*
 * Mark the page at offset 'p' in the bit map as having been processed.
 */

#define MARK_PAGE_HANDLED(c, p) \
MACRO_BEGIN \
        (c) = (c) & ~(1ULL << (p)); \
MACRO_END
2145
2146
2147 /*
2148  * Return true if the page at the given offset has been paged out.  Object is
2149  * locked upon entry and returned locked.
2150  */
2151
2152 static boolean_t
2153 page_is_paged_out(
2154         vm_object_t             object,
2155         vm_object_offset_t      offset)
2156 {
2157         kern_return_t   kr;
2158         memory_object_t pager;
2159
2160         /*
2161          * Check the existence map for the page if we have one, otherwise
2162          * ask the pager about this page.
2163          */
2164
2165 #if MACH_PAGEMAP
2166         if (object->existence_map) {
2167                 if (vm_external_state_get(object->existence_map, offset)
2168                     == VM_EXTERNAL_STATE_EXISTS) {
2169                         /*
2170                          * We found the page
2171                          */
2172
2173                         return TRUE;
2174                 }
2175         } else
2176 #endif
2177                 if (object->internal &&
2178                    object->alive &&
2179                    !object->terminating &&
2180                    object->pager_ready) {
2181
2182                 /*
2183                  * We're already holding a "paging in progress" reference
2184                  * so the object can't disappear when we release the lock.
2185                  */
2186
2187                 assert(object->paging_in_progress);
2188                 pager = object->pager;
2189                 vm_object_unlock(object);
2190
2191                 kr = memory_object_data_request(
2192                         pager,
2193                         offset + object->paging_offset,
2194                         0,      /* just poke the pager */
2195                         VM_PROT_READ,
2196                         NULL);
2197
2198                 vm_object_lock(object);
2199
2200                 if (kr == KERN_SUCCESS) {
2201
2202                         /*
2203                          * We found the page
2204                          */
2205
2206                         return TRUE;
2207                 }
2208         }
2209
2210         return FALSE;
2211 }
2212
2213
2214 /*
2215  * Deactivate the pages in the specified object and range.  If kill_page is set, also discard any
2216  * page modified state from the pmap.  Update the chunk_state as we go along.  The caller must specify
2217  * a size that is less than or equal to the CHUNK_SIZE.
2218  */
2219
2220 static void
2221 deactivate_pages_in_object(
2222         vm_object_t             object,
2223         vm_object_offset_t      offset,
2224         vm_object_size_t        size,
2225         boolean_t               kill_page,
2226         boolean_t               reusable_page,
2227 #if !MACH_ASSERT
2228         __unused
2229 #endif
2230         boolean_t               all_reusable,
2231         chunk_state_t           *chunk_state)
2232 {
2233         vm_page_t       m;
2234         int             p;
2235         struct  dw      dw_array[DELAYED_WORK_LIMIT];
2236         struct  dw      *dwp;
2237         int             dw_count;
2238         unsigned int    reusable = 0;
2239
2240
2241         /*
2242          * Examine each page in the chunk.  The variable 'p' is the page number relative to the start of the
2243          * chunk.  Since this routine is called once for each level in the shadow chain, the chunk_state may
2244          * have pages marked as having been processed already.  We stop the loop early if we find we've handled
2245          * all the pages in the chunk.
2246          */
2247
2248         dwp = &dw_array[0];
2249         dw_count = 0;
2250
2251         for(p = 0; size && CHUNK_NOT_COMPLETE(*chunk_state); p++, size -= PAGE_SIZE_64, offset += PAGE_SIZE_64) {
2252
2253                 /*
2254                  * If this offset has already been found and handled in a higher level object, then don't
2255                  * do anything with it in the current shadow object.
2256                  */
2257
2258                 if (PAGE_ALREADY_HANDLED(*chunk_state, p))
2259                         continue;
2260
2261                 /*
2262                  * See if the page at this offset is around.  First check to see if the page is resident,
2263                  * then if not, check the existence map or with the pager.
2264                  */
2265
2266                 if ((m = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
2267
2268                         /*
2269                          * We found a page we were looking for.  Mark it as "handled" now in the chunk_state
2270                          * so that we won't bother looking for a page at this offset again if there are more
2271                          * shadow objects.  Then deactivate the page.
2272                          */
2273
2274                         MARK_PAGE_HANDLED(*chunk_state, p);
2275
2276                         if (( !VM_PAGE_WIRED(m)) && (!m->private) && (!m->gobbled) && (!m->busy)) {
2277                                 int     clear_refmod;
2278
2279                                 assert(!m->laundry);
2280
2281                                 clear_refmod = VM_MEM_REFERENCED;
2282                                 dwp->dw_mask = DW_clear_reference;
2283
2284                                 if ((kill_page) && (object->internal)) {
2285                                         m->precious = FALSE;
2286                                         m->dirty = FALSE;
2287
2288                                         clear_refmod |= VM_MEM_MODIFIED;
2289                                         if (m->throttled) {
2290                                                 /*
2291                                                  * This page is now clean and
2292                                                  * reclaimable.  Move it out
2293                                                  * of the throttled queue, so
2294                                                  * that vm_pageout_scan() can
2295                                                  * find it.
2296                                                  */
2297                                                 dwp->dw_mask |= DW_move_page;
2298                                         }
2299 #if     MACH_PAGEMAP
2300                                         vm_external_state_clr(object->existence_map, offset);
2301 #endif  /* MACH_PAGEMAP */
2302
2303                                         if (reusable_page && !m->reusable) {
2304                                                 assert(!all_reusable);
2305                                                 assert(!object->all_reusable);
2306                                                 m->reusable = TRUE;
2307                                                 object->reusable_page_count++;
2308                                                 assert(object->resident_page_count >= object->reusable_page_count);
2309                                                 reusable++;
2310 #if CONFIG_EMBEDDED
2311                                         } else {
2312                                                 if (m->reusable) {
2313                                                         m->reusable = FALSE;
2314                                                         object->reusable_page_count--;
2315                                                 }
2316 #endif
2317                                         }
2318                                 }
2319                                 pmap_clear_refmod(m->phys_page, clear_refmod);
2320
2321                                 if (!m->throttled && !(reusable_page || all_reusable))
2322                                         dwp->dw_mask |= DW_move_page;
2323                                 /*
2324                                  * dw_do_work may need to drop the object lock
2325                                  * if it does, we need the pages its looking at to
2326                                  * be held stable via the busy bit.
2327                                  */
2328                                 m->busy = TRUE;
2329                                 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
2330
2331                                 dwp->dw_m = m;
2332                                 dwp++;
2333                                 dw_count++;
2334
2335                                 if (dw_count >= DELAYED_WORK_LIMIT) {
2336                                         if (reusable) {
2337                                                 OSAddAtomic(reusable,
2338                                                             &vm_page_stats_reusable.reusable_count);
2339                                                 vm_page_stats_reusable.reusable += reusable;
2340                                                 reusable = 0;
2341                                         }
2342                                         dw_do_work(object, &dw_array[0], dw_count);
2343
2344                                         dwp = &dw_array[0];
2345                                         dw_count = 0;
2346                                 }
2347                         }
2348
2349                 } else {
2350
2351                         /*
2352                          * The page at this offset isn't memory resident, check to see if it's
2353                          * been paged out.  If so, mark it as handled so we don't bother looking
2354                          * for it in the shadow chain.
2355                          */
2356
2357                         if (page_is_paged_out(object, offset)) {
2358                                 MARK_PAGE_HANDLED(*chunk_state, p);
2359
2360                                 /*
2361                                  * If we're killing a non-resident page, then clear the page in the existence
2362                                  * map so we don't bother paging it back in if it's touched again in the future.
2363                                  */
2364
2365                                 if ((kill_page) && (object->internal)) {
2366 #if     MACH_PAGEMAP
2367                                         vm_external_state_clr(object->existence_map, offset);
2368 #endif  /* MACH_PAGEMAP */
2369                                 }
2370                         }
2371                 }
2372         }
2373
2374         if (reusable) {
2375                 OSAddAtomic(reusable, &vm_page_stats_reusable.reusable_count);
2376                 vm_page_stats_reusable.reusable += reusable;
2377                 reusable = 0;
2378         }
2379
2380         if (dw_count)
2381                 dw_do_work(object, &dw_array[0], dw_count);
2382 }
2383
2384
2385 /*
2386  * Deactive a "chunk" of the given range of the object starting at offset.  A "chunk"
2387  * will always be less than or equal to the given size.  The total range is divided up
2388  * into chunks for efficiency and performance related to the locks and handling the shadow
2389  * chain.  This routine returns how much of the given "size" it actually processed.  It's
2390  * up to the caler to loop and keep calling this routine until the entire range they want
2391  * to process has been done.
2392  */
2393
2394 static vm_object_size_t
2395 deactivate_a_chunk(
2396         vm_object_t             orig_object,
2397         vm_object_offset_t      offset,
2398         vm_object_size_t        size,
2399         boolean_t               kill_page,
2400         boolean_t               reusable_page,
2401         boolean_t               all_reusable)
2402 {
2403         vm_object_t             object;
2404         vm_object_t             tmp_object;
2405         vm_object_size_t        length;
2406         chunk_state_t           chunk_state;
2407
2408
2409         /*
2410          * Get set to do a chunk.  We'll do up to CHUNK_SIZE, but no more than the
2411          * remaining size the caller asked for.
2412          */
2413
2414         length = MIN(size, CHUNK_SIZE);
2415
2416         /*
2417          * The chunk_state keeps track of which pages we've already processed if there's
2418          * a shadow chain on this object.  At this point, we haven't done anything with this
2419          * range of pages yet, so initialize the state to indicate no pages processed yet.
2420          */
2421
2422         CHUNK_INIT(chunk_state, length);
2423         object = orig_object;
2424
2425         /*
2426          * Start at the top level object and iterate around the loop once for each object
2427          * in the shadow chain.  We stop processing early if we've already found all the pages
2428          * in the range.  Otherwise we stop when we run out of shadow objects.
2429          */
2430
2431         while (object && CHUNK_NOT_COMPLETE(chunk_state)) {
2432                 vm_object_paging_begin(object);
2433
2434                 deactivate_pages_in_object(object, offset, length, kill_page, reusable_page, all_reusable, &chunk_state);
2435
2436                 vm_object_paging_end(object);
2437
2438                 /*
2439                  * We've finished with this object, see if there's a shadow object.  If
2440                  * there is, update the offset and lock the new object.  We also turn off
2441                  * kill_page at this point since we only kill pages in the top most object.
2442                  */
2443
2444                 tmp_object = object->shadow;
2445
2446                 if (tmp_object) {
2447                         kill_page = FALSE;
2448                         reusable_page = FALSE;
2449                         all_reusable = FALSE;
2450                         offset += object->shadow_offset;
2451                         vm_object_lock(tmp_object);
2452                 }
2453
2454                 if (object != orig_object)
2455                         vm_object_unlock(object);
2456
2457                 object = tmp_object;
2458         }
2459
2460         if (object && object != orig_object)
2461                 vm_object_unlock(object);
2462
2463         return length;
2464 }
2465
2466
2467
2468 /*
2469  * Move any resident pages in the specified range to the inactive queue.  If kill_page is set,
2470  * we also clear the modified status of the page and "forget" any changes that have been made
2471  * to the page.
2472  */
2473
2474 __private_extern__ void
2475 vm_object_deactivate_pages(
2476         vm_object_t             object,
2477         vm_object_offset_t      offset,
2478         vm_object_size_t        size,
2479         boolean_t               kill_page,
2480         boolean_t               reusable_page)
2481 {
2482         vm_object_size_t        length;
2483         boolean_t               all_reusable;
2484
2485         /*
2486          * We break the range up into chunks and do one chunk at a time.  This is for
2487          * efficiency and performance while handling the shadow chains and the locks.
2488          * The deactivate_a_chunk() function returns how much of the range it processed.
2489          * We keep calling this routine until the given size is exhausted.
2490          */
2491
2492
2493         all_reusable = FALSE;
2494         if (reusable_page &&
2495             object->size != 0 &&
2496             object->size == size &&
2497             object->reusable_page_count == 0) {
2498                 all_reusable = TRUE;
2499                 reusable_page = FALSE;
2500         }
2501
2502 #if CONFIG_EMBEDDED
2503         if ((reusable_page || all_reusable) && object->all_reusable) {
2504                 /* This means MADV_FREE_REUSABLE has been called twice, which
2505                  * is probably illegal. */
2506                 return;
2507         }
2508 #endif
2509
2510         while (size) {
2511                 length = deactivate_a_chunk(object, offset, size, kill_page, reusable_page, all_reusable);
2512
2513                 size -= length;
2514                 offset += length;
2515         }
2516
2517         if (all_reusable) {
2518                 if (!object->all_reusable) {
2519                         unsigned int reusable;
2520
2521                         object->all_reusable = TRUE;
2522                         assert(object->reusable_page_count == 0);
2523                         /* update global stats */
2524                         reusable = object->resident_page_count;
2525                         OSAddAtomic(reusable,
2526                                     &vm_page_stats_reusable.reusable_count);
2527                         vm_page_stats_reusable.reusable += reusable;
2528                         vm_page_stats_reusable.all_reusable_calls++;
2529                 }
2530         } else if (reusable_page) {
2531                 vm_page_stats_reusable.partial_reusable_calls++;
2532         }
2533 }
2534
2535 void
2536 vm_object_reuse_pages(
2537         vm_object_t             object,
2538         vm_object_offset_t      start_offset,
2539         vm_object_offset_t      end_offset,
2540         boolean_t               allow_partial_reuse)
2541 {
2542         vm_object_offset_t      cur_offset;
2543         vm_page_t               m;
2544         unsigned int            reused, reusable;
2545
2546 #define VM_OBJECT_REUSE_PAGE(object, m, reused)                         \
2547         MACRO_BEGIN                                                     \
2548                 if ((m) != VM_PAGE_NULL &&                              \
2549                     (m)->reusable) {                                    \
2550                         assert((object)->reusable_page_count <=         \
2551                                (object)->resident_page_count);          \
2552                         assert((object)->reusable_page_count > 0);      \
2553                         (object)->reusable_page_count--;                \
2554                         (m)->reusable = FALSE;                          \
2555                         (reused)++;                                     \
2556                 }                                                       \
2557         MACRO_END
2558
2559         reused = 0;
2560         reusable = 0;
2561
2562         vm_object_lock_assert_exclusive(object);
2563
2564         if (object->all_reusable) {
2565                 assert(object->reusable_page_count == 0);
2566                 object->all_reusable = FALSE;
2567                 if (end_offset - start_offset == object->size ||
2568                     !allow_partial_reuse) {
2569                         vm_page_stats_reusable.all_reuse_calls++;
2570                         reused = object->resident_page_count;
2571                 } else {
2572                         vm_page_stats_reusable.partial_reuse_calls++;
2573                         queue_iterate(&object->memq, m, vm_page_t, listq) {
2574                                 if (m->offset < start_offset ||
2575                                     m->offset >= end_offset) {
2576                                         m->reusable = TRUE;
2577                                         object->reusable_page_count++;
2578                                         assert(object->resident_page_count >= object->reusable_page_count);
2579                                         continue;
2580                                 } else {
2581                                         assert(!m->reusable);
2582                                         reused++;
2583                                 }
2584                         }
2585                 }
2586         } else if (object->resident_page_count >
2587                    ((end_offset - start_offset) >> PAGE_SHIFT)) {
2588                 vm_page_stats_reusable.partial_reuse_calls++;
2589                 for (cur_offset = start_offset;
2590                      cur_offset < end_offset;
2591                      cur_offset += PAGE_SIZE_64) {
2592                         if (object->reusable_page_count == 0) {
2593                                 break;
2594                         }
2595                         m = vm_page_lookup(object, cur_offset);
2596                         VM_OBJECT_REUSE_PAGE(object, m, reused);
2597                 }
2598         } else {
2599                 vm_page_stats_reusable.partial_reuse_calls++;
2600                 queue_iterate(&object->memq, m, vm_page_t, listq) {
2601                         if (object->reusable_page_count == 0) {
2602                                 break;
2603                         }
2604                         if (m->offset < start_offset ||
2605                             m->offset >= end_offset) {
2606                                 continue;
2607                         }
2608                         VM_OBJECT_REUSE_PAGE(object, m, reused);
2609                 }
2610         }
2611
2612         /* update global stats */
2613         OSAddAtomic(reusable-reused, &vm_page_stats_reusable.reusable_count);
2614         vm_page_stats_reusable.reused += reused;
2615         vm_page_stats_reusable.reusable += reusable;
2616 }
2617
2618 /*
2619  *      Routine:        vm_object_pmap_protect
2620  *
2621  *      Purpose:
2622  *              Reduces the permission for all physical
2623  *              pages in the specified object range.
2624  *
2625  *              If removing write permission only, it is
2626  *              sufficient to protect only the pages in
2627  *              the top-level object; only those pages may
2628  *              have write permission.
2629  *
2630  *              If removing all access, we must follow the
2631  *              shadow chain from the top-level object to
2632  *              remove access to all pages in shadowed objects.
2633  *
2634  *              The object must *not* be locked.  The object must
2635  *              be temporary/internal.
2636  *
2637  *              If pmap is not NULL, this routine assumes that
2638  *              the only mappings for the pages are in that
2639  *              pmap.
2640  */
2641
2642 __private_extern__ void
2643 vm_object_pmap_protect(
2644         register vm_object_t            object,
2645         register vm_object_offset_t     offset,
2646         vm_object_size_t                size,
2647         pmap_t                          pmap,
2648         vm_map_offset_t                 pmap_start,
2649         vm_prot_t                       prot)
2650 {
2651         if (object == VM_OBJECT_NULL)
2652             return;
2653         size = vm_object_round_page(size);
2654         offset = vm_object_trunc_page(offset);
2655
2656         vm_object_lock(object);
2657
2658         if (object->phys_contiguous) {
2659                 if (pmap != NULL) {
2660                         vm_object_unlock(object);
2661                         pmap_protect(pmap, pmap_start, pmap_start + size, prot);
2662                 } else {
2663                         vm_object_offset_t phys_start, phys_end, phys_addr;
2664
2665                         phys_start = object->shadow_offset + offset;
2666                         phys_end = phys_start + size;
2667                         assert(phys_start <= phys_end);
2668                         assert(phys_end <= object->shadow_offset + object->size);
2669                         vm_object_unlock(object);
2670
2671                         for (phys_addr = phys_start;
2672                              phys_addr < phys_end;
2673                              phys_addr += PAGE_SIZE_64) {
2674                                 pmap_page_protect((ppnum_t) (phys_addr >> PAGE_SHIFT), prot);
2675                         }
2676                 }
2677                 return;
2678         }
2679
2680         assert(object->internal);
2681
2682         while (TRUE) {
2683            if (ptoa_64(object->resident_page_count) > size/2 && pmap != PMAP_NULL) {
2684                 vm_object_unlock(object);
2685                 pmap_protect(pmap, pmap_start, pmap_start + size, prot);
2686                 return;
2687             }
2688
2689             /* if we are doing large ranges with respect to resident */
2690             /* page count then we should interate over pages otherwise */
2691             /* inverse page look-up will be faster */
2692             if (ptoa_64(object->resident_page_count / 4) <  size) {
2693                 vm_page_t               p;
2694                 vm_object_offset_t      end;
2695
2696                 end = offset + size;
2697
2698                 if (pmap != PMAP_NULL) {
2699                   queue_iterate(&object->memq, p, vm_page_t, listq) {
2700                     if (!p->fictitious &&
2701                         (offset <= p->offset) && (p->offset < end)) {
2702                         vm_map_offset_t start;
2703
2704                         start = pmap_start + p->offset - offset;
2705                         pmap_protect(pmap, start, start + PAGE_SIZE_64, prot);
2706                     }
2707                   }
2708                 } else {
2709                   queue_iterate(&object->memq, p, vm_page_t, listq) {
2710                     if (!p->fictitious &&
2711                         (offset <= p->offset) && (p->offset < end)) {
2712
2713                         pmap_page_protect(p->phys_page, prot);
2714                     }
2715                   }
2716                 }
2717            } else {
2718                 vm_page_t               p;
2719                 vm_object_offset_t      end;
2720                 vm_object_offset_t      target_off;
2721
2722                 end = offset + size;
2723
2724                 if (pmap != PMAP_NULL) {
2725                         for(target_off = offset;
2726                             target_off < end;
2727                             target_off += PAGE_SIZE) {
2728                                 p = vm_page_lookup(object, target_off);
2729                                 if (p != VM_PAGE_NULL) {
2730                                         vm_object_offset_t start;
2731                                         start = pmap_start +
2732                                                 (p->offset - offset);
2733                                         pmap_protect(pmap, start,
2734                                                      start + PAGE_SIZE, prot);
2735                                 }
2736                         }
2737                 } else {
2738                         for(target_off = offset;
2739                                 target_off < end; target_off += PAGE_SIZE) {
2740                                 p = vm_page_lookup(object, target_off);
2741                                 if (p != VM_PAGE_NULL) {
2742                                         pmap_page_protect(p->phys_page, prot);
2743                                 }
2744                         }
2745                 }
2746           }
2747
2748             if (prot == VM_PROT_NONE) {
2749                 /*
2750                  * Must follow shadow chain to remove access
2751                  * to pages in shadowed objects.
2752                  */
2753                 register vm_object_t    next_object;
2754
2755                 next_object = object->shadow;
2756                 if (next_object != VM_OBJECT_NULL) {
2757                     offset += object->shadow_offset;
2758                     vm_object_lock(next_object);
2759                     vm_object_unlock(object);
2760                     object = next_object;
2761                 }
2762                 else {
2763                     /*
2764                      * End of chain - we are done.
2765                      */
2766                     break;
2767                 }
2768             }
2769             else {
2770                 /*
2771                  * Pages in shadowed objects may never have
2772                  * write permission - we may stop here.
2773                  */
2774                 break;
2775             }
2776         }
2777
2778         vm_object_unlock(object);
2779 }
2780
2781 /*
2782  *      Routine:        vm_object_copy_slowly
2783  *
2784  *      Description:
2785  *              Copy the specified range of the source
2786  *              virtual memory object without using
2787  *              protection-based optimizations (such
2788  *              as copy-on-write).  The pages in the
2789  *              region are actually copied.
2790  *
2791  *      In/out conditions:
2792  *              The caller must hold a reference and a lock
2793  *              for the source virtual memory object.  The source
2794  *              object will be returned *unlocked*.
2795  *
2796  *      Results:
2797  *              If the copy is completed successfully, KERN_SUCCESS is
2798  *              returned.  If the caller asserted the interruptible
2799  *              argument, and an interruption occurred while waiting
2800  *              for a user-generated event, MACH_SEND_INTERRUPTED is
2801  *              returned.  Other values may be returned to indicate
2802  *              hard errors during the copy operation.
2803  *
2804  *              A new virtual memory object is returned in a
2805  *              parameter (_result_object).  The contents of this
2806  *              new object, starting at a zero offset, are a copy
2807  *              of the source memory region.  In the event of
2808  *              an error, this parameter will contain the value
2809  *              VM_OBJECT_NULL.
2810  */
2811 __private_extern__ kern_return_t
2812 vm_object_copy_slowly(
2813         register vm_object_t    src_object,
2814         vm_object_offset_t      src_offset,
2815         vm_object_size_t        size,
2816         boolean_t               interruptible,
2817         vm_object_t             *_result_object)        /* OUT */
2818 {
2819         vm_object_t             new_object;
2820         vm_object_offset_t      new_offset;
2821
2822         struct vm_object_fault_info fault_info;
2823
2824         XPR(XPR_VM_OBJECT, "v_o_c_slowly obj 0x%x off 0x%x size 0x%x\n",
2825             src_object, src_offset, size, 0, 0);
2826
2827         if (size == 0) {
2828                 vm_object_unlock(src_object);
2829                 *_result_object = VM_OBJECT_NULL;
2830                 return(KERN_INVALID_ARGUMENT);
2831         }
2832
2833         /*
2834          *      Prevent destruction of the source object while we copy.
2835          */
2836
2837         vm_object_reference_locked(src_object);
2838         vm_object_unlock(src_object);
2839
2840         /*
2841          *      Create a new object to hold the copied pages.
2842          *      A few notes:
2843          *              We fill the new object starting at offset 0,
2844          *               regardless of the input offset.
2845          *              We don't bother to lock the new object within
2846          *               this routine, since we have the only reference.
2847          */
2848
2849         new_object = vm_object_allocate(size);
2850         new_offset = 0;
2851
2852         assert(size == trunc_page_64(size));    /* Will the loop terminate? */
2853
2854         fault_info.interruptible = interruptible;
2855         fault_info.behavior  = VM_BEHAVIOR_SEQUENTIAL;
2856         fault_info.user_tag  = 0;
2857         fault_info.lo_offset = src_offset;
2858         fault_info.hi_offset = src_offset + size;
2859         fault_info.no_cache  = FALSE;
2860         fault_info.stealth = TRUE;
2861         fault_info.mark_zf_absent = FALSE;
2862
2863         for ( ;
2864             size != 0 ;
2865             src_offset += PAGE_SIZE_64,
2866                         new_offset += PAGE_SIZE_64, size -= PAGE_SIZE_64
2867             ) {
2868                 vm_page_t       new_page;
2869                 vm_fault_return_t result;
2870
2871                 vm_object_lock(new_object);
2872
2873                 while ((new_page = vm_page_alloc(new_object, new_offset))
2874                                 == VM_PAGE_NULL) {
2875
2876                         vm_object_unlock(new_object);
2877
2878                         if (!vm_page_wait(interruptible)) {
2879                                 vm_object_deallocate(new_object);
2880                                 vm_object_deallocate(src_object);
2881                                 *_result_object = VM_OBJECT_NULL;
2882                                 return(MACH_SEND_INTERRUPTED);
2883                         }
2884                         vm_object_lock(new_object);
2885                 }
2886                 vm_object_unlock(new_object);
2887
2888                 do {
2889                         vm_prot_t       prot = VM_PROT_READ;
2890                         vm_page_t       _result_page;
2891                         vm_page_t       top_page;
2892                         register
2893                         vm_page_t       result_page;
2894                         kern_return_t   error_code;
2895
2896                         vm_object_lock(src_object);
2897                         vm_object_paging_begin(src_object);
2898
2899                         if (size > (vm_size_t) -1) {
2900                                 /* 32-bit overflow */
2901                                 fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
2902                         } else {
2903                                 fault_info.cluster_size = (vm_size_t) size;
2904                                 assert(fault_info.cluster_size == size);
2905                         }
2906
2907                         XPR(XPR_VM_FAULT,"vm_object_copy_slowly -> vm_fault_page",0,0,0,0,0);
2908                         result = vm_fault_page(src_object, src_offset,
2909                                 VM_PROT_READ, FALSE,
2910                                 &prot, &_result_page, &top_page,
2911                                 (int *)0,
2912                                 &error_code, FALSE, FALSE, &fault_info);
2913
2914                         switch(result) {
2915                         case VM_FAULT_SUCCESS:
2916                                 result_page = _result_page;
2917
2918                                 /*
2919                                  *      We don't need to hold the object
2920                                  *      lock -- the busy page will be enough.
2921                                  *      [We don't care about picking up any
2922                                  *      new modifications.]
2923                                  *
2924                                  *      Copy the page to the new object.
2925                                  *
2926                                  *      POLICY DECISION:
2927                                  *              If result_page is clean,
2928                                  *              we could steal it instead
2929                                  *              of copying.
2930                                  */
2931
2932                                 vm_object_unlock(result_page->object);
2933                                 vm_page_copy(result_page, new_page);
2934
2935                                 /*
2936                                  *      Let go of both pages (make them
2937                                  *      not busy, perform wakeup, activate).
2938                                  */
2939                                 vm_object_lock(new_object);
2940                                 new_page->dirty = TRUE;
2941                                 PAGE_WAKEUP_DONE(new_page);
2942                                 vm_object_unlock(new_object);
2943
2944                                 vm_object_lock(result_page->object);
2945                                 PAGE_WAKEUP_DONE(result_page);
2946
2947                                 vm_page_lockspin_queues();
2948                                 if (!result_page->active &&
2949                                     !result_page->inactive &&
2950                                     !result_page->throttled)
2951                                         vm_page_activate(result_page);
2952                                 vm_page_activate(new_page);
2953                                 vm_page_unlock_queues();
2954
2955                                 /*
2956                                  *      Release paging references and
2957                                  *      top-level placeholder page, if any.
2958                                  */
2959
2960                                 vm_fault_cleanup(result_page->object,
2961                                                  top_page);
2962
2963                                 break;
2964
2965                         case VM_FAULT_RETRY:
2966                                 break;
2967
2968                         case VM_FAULT_FICTITIOUS_SHORTAGE:
2969                                 vm_page_more_fictitious();
2970                                 break;
2971
2972                         case VM_FAULT_MEMORY_SHORTAGE:
2973                                 if (vm_page_wait(interruptible))
2974                                         break;
2975                                 /* fall thru */
2976
2977                         case VM_FAULT_INTERRUPTED:
2978                                 vm_object_lock(new_object);
2979                                 VM_PAGE_FREE(new_page);
2980                                 vm_object_unlock(new_object);
2981
2982                                 vm_object_deallocate(new_object);
2983                                 vm_object_deallocate(src_object);
2984                                 *_result_object = VM_OBJECT_NULL;
2985                                 return(MACH_SEND_INTERRUPTED);
2986
2987                         case VM_FAULT_SUCCESS_NO_VM_PAGE:
2988                                 /* success but no VM page: fail */
2989                                 vm_object_paging_end(src_object);
2990                                 vm_object_unlock(src_object);
2991                                 /*FALLTHROUGH*/
2992                         case VM_FAULT_MEMORY_ERROR:
2993                                 /*
2994                                  * A policy choice:
2995                                  *      (a) ignore pages that we can't
2996                                  *          copy
2997                                  *      (b) return the null object if
2998                                  *          any page fails [chosen]
2999                                  */
3000
3001                                 vm_object_lock(new_object);
3002                                 VM_PAGE_FREE(new_page);
3003                                 vm_object_unlock(new_object);
3004
3005                                 vm_object_deallocate(new_object);
3006                                 vm_object_deallocate(src_object);
3007                                 *_result_object = VM_OBJECT_NULL;
3008                                 return(error_code ? error_code:
3009                                        KERN_MEMORY_ERROR);
3010
3011                         default:
3012                                 panic("vm_object_copy_slowly: unexpected error"
3013                                       " 0x%x from vm_fault_page()\n", result);
3014                         }
3015                 } while (result != VM_FAULT_SUCCESS);
3016         }
3017
3018         /*
3019          *      Lose the extra reference, and return our object.
3020          */
3021         vm_object_deallocate(src_object);
3022         *_result_object = new_object;
3023         return(KERN_SUCCESS);
3024 }
3025
3026 /*
3027  *      Routine:        vm_object_copy_quickly
3028  *
3029  *      Purpose:
3030  *              Try to copy the source virtual memory object
3031  *              without waiting for user-generated events.
3032  *              Only an object whose copy strategy is
3033  *              MEMORY_OBJECT_COPY_SYMMETRIC is accepted.
3034  *
3035  *      Results:
3036  *              TRUE if the copy was made (or the object is null),
3037  *              with both "needs_copy" flags filled in; FALSE
3038  *              otherwise, with the arguments left unaffected.
3039  *
3040  *      In/out conditions:
3041  *              The object should be unlocked on entry and exit.
3042  */
3043 /*ARGSUSED*/
3044 __private_extern__ boolean_t
3045 vm_object_copy_quickly(
3046         vm_object_t             *_object,               /* INOUT */
3047         __unused vm_object_offset_t     offset, /* IN */
3048         __unused vm_object_size_t       size,   /* IN */
3049         boolean_t               *_src_needs_copy,       /* OUT */
3050         boolean_t               *_dst_needs_copy)       /* OUT */
3051 {
3052         vm_object_t     src = *_object;
3053
3054         XPR(XPR_VM_OBJECT, "v_o_c_quickly obj 0x%x off 0x%x size 0x%x\n",
3055             *_object, offset, size, 0, 0);
3056
3057         /*
3058          *      No object at all: there is nothing to share, so
3059          *      neither side is asked to make a shadow.
3060          */
3061         if (src == VM_OBJECT_NULL) {
3062                 *_src_needs_copy = FALSE;
3063                 *_dst_needs_copy = FALSE;
3064                 return TRUE;
3065         }
3066
3067         vm_object_lock(src);
3068
3069         /*
3070          *      Every strategy other than the symmetric one
3071          *      (MEMORY_OBJECT_COPY_DELAY included) is refused
3072          *      here, after dropping the lock; the OUT
3073          *      arguments are not written.
3074          */
3075         if (src->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
3076                 vm_object_unlock(src);
3077                 return FALSE;
3078         }
3079
3080         /*
3081          *      Symmetric copy strategy.
3082          *
3083          *      The copy is one more reference on the very same
3084          *      object, which is now flagged as shadowed.  The
3085          *      caller's object/offset are left unchanged.
3086          */
3087         vm_object_reference_locked(src);
3088         src->shadowed = TRUE;
3089         vm_object_unlock(src);
3090
3091         /*
3092          *      Both source and destination must make
3093          *      shadows, and the source must be made
3094          *      read-only if not already.
3095          */
3096         *_src_needs_copy = TRUE;
3097         *_dst_needs_copy = TRUE;
3098
3099         return TRUE;
3100 }
3101
3102 static int copy_call_count = 0;           /* entries into vm_object_copy_call */
3103 static int copy_call_sleep_count = 0;     /* sleeps while waiting for the copy to arrive */
3104 static int copy_call_restart_count = 0;   /* sleeps behind a copy call already in progress */
3105
3106 /*
3107  *      Routine:        vm_object_copy_call [internal]
3108  *
3109  *      Description:
3110  *              Copy the source object (src_object), using the
3111  *              user-managed copy algorithm.
3112  *
3113  *      In/out conditions:
3114  *              The source object must be locked on entry.  It
3115  *              will be *unlocked* on exit.
3116  *
3117  *      Results:
3118  *              If the copy is successful, KERN_SUCCESS is returned and
3119  *              a new object that represents the copied virtual memory
3120  *              is returned in a parameter (*_result_object).  On error
3121  *              this parameter is not valid.  As currently written the
3122  *              routine always returns KERN_FAILURE (see "kr" below).
3123  */
3124 static kern_return_t
3125 vm_object_copy_call(
3126         vm_object_t             src_object,
3127         vm_object_offset_t      src_offset,
3128         vm_object_size_t        size,
3129         vm_object_t             *_result_object)        /* OUT */
3130 {
3131         kern_return_t   kr;
3132         vm_object_t     copy;
3133         boolean_t       check_ready = FALSE;    /* copy's pager was not ready when examined */
3134         uint32_t        try_failed_count = 0;   /* failed try-locks of the copy object */
3135
3136         /*
3137          *      If a copy is already in progress, wait and retry.
3138          *
3139          *      XXX
3140          *      Consider making this call interruptible, as Mike
3141          *      intended it to be.
3142          *
3143          *      XXXO
3144          *      Need a counter or version or something to allow
3145          *      us to use the copy that the currently requesting
3146          *      thread is obtaining -- is it worth adding to the
3147          *      vm object structure? Depends how common this case is.
3148          */
3149         copy_call_count++;
3150         while (vm_object_wanted(src_object, VM_OBJECT_EVENT_COPY_CALL)) {
3151                 vm_object_sleep(src_object, VM_OBJECT_EVENT_COPY_CALL,
3152                                THREAD_UNINT);
3153                 copy_call_restart_count++;
3154         }
3155
3156         /*
3157          *      Indicate (for the benefit of memory_object_create_copy)
3158          *      that we want a copy for src_object. (Note that we cannot
3159          *      do a real assert_wait before calling memory_object_copy,
3160          *      so we simply set the flag.)
3161          */
3162
3163         vm_object_set_wanted(src_object, VM_OBJECT_EVENT_COPY_CALL);
3164         vm_object_unlock(src_object);
3165         /* NOTE(review): nothing in this routine clears the mark set above before the early return below -- confirm. */
3166         /*
3167          *      Ask the memory manager to give us a memory object
3168          *      which represents a copy of the src object.
3169          *      The memory manager may give us a memory object
3170          *      which we already have, or it may give us a
3171          *      new memory object. This memory object will arrive
3172          *      via memory_object_create_copy.
3173          */
3174         /* No request is actually issued here: kr is hard-wired to KERN_FAILURE. */
3175         kr = KERN_FAILURE;      /* XXX need to change memory_object.defs */
3176         if (kr != KERN_SUCCESS) {
3177                 return kr;
3178         }
3179         /* Unreachable as written: the check above always returns. */
3180         /*
3181          *      Wait for the copy to arrive.
3182          */
3183         vm_object_lock(src_object);
3184         while (vm_object_wanted(src_object, VM_OBJECT_EVENT_COPY_CALL)) {
3185                 vm_object_sleep(src_object, VM_OBJECT_EVENT_COPY_CALL,
3186                                THREAD_UNINT);
3187                 copy_call_sleep_count++;
3188         }
3189 Retry:
3190         assert(src_object->copy != VM_OBJECT_NULL);
3191         copy = src_object->copy;
3192         if (!vm_object_lock_try(copy)) {        /* try-lock failed: drop src lock, pause, start over */
3193                 vm_object_unlock(src_object);
3194
3195                 try_failed_count++;
3196                 mutex_pause(try_failed_count);  /* wait a bit */
3197
3198                 vm_object_lock(src_object);
3199                 goto Retry;
3200         }
3201         if (copy->size < src_offset+size)       /* grow the copy to cover the requested range */
3202                 copy->size = src_offset+size;
3203
3204         if (!copy->pager_ready)
3205                 check_ready = TRUE;
3206
3207         /*
3208          *      Return the copy.
3209          */
3210         *_result_object = copy;
3211         vm_object_unlock(copy);
3212         vm_object_unlock(src_object);
3213
3214         /* Wait for the copy to be ready. */
3215         if (check_ready == TRUE) {
3216                 vm_object_lock(copy);
3217                 while (!copy->pager_ready) {
3218                         vm_object_sleep(copy, VM_OBJECT_EVENT_PAGER_READY, THREAD_UNINT);
3219                 }
3220                 vm_object_unlock(copy);
3221         }
3222
3223         return KERN_SUCCESS;
3224 }
3225
3226 static int copy_delayed_lock_collisions = 0;      /* failed try-locks of the old copy object */
3227 static int copy_delayed_max_collisions = 0;       /* most collisions seen within one call */
3228 static int copy_delayed_lock_contention = 0;      /* calls that hit at least one collision */
3229 static int copy_delayed_protect_iterate = 0;      /* walks of src_object->memq to check/protect pages */
3230
3231 /*
3232  *      Routine:        vm_object_copy_delayed [internal]
3233  *
3234  *      Description:
3235  *              Copy the specified virtual memory object, using
3236  *              the asymmetric copy-on-write algorithm.
3237  *
3238  *      In/out conditions:
3239  *              The src_object must be locked on entry.  It will be unlocked
3240  *              on exit - so the caller must also hold a reference to it.
3241  *              Returns the copy object, or VM_OBJECT_NULL if one of the
3242  *              pages that must be write-protected is wired.  Will not block
3243  *              waiting for user-generated events.  It is not interruptible.
3244  */
3245 __private_extern__ vm_object_t
3246 vm_object_copy_delayed(
3247         vm_object_t             src_object,
3248         vm_object_offset_t      src_offset,
3249         vm_object_size_t        size,
3250         boolean_t               src_object_shared)      /* presumably TRUE when the caller holds the lock shared -- confirm */
3251 {
3252         vm_object_t             new_copy = VM_OBJECT_NULL;
3253         vm_object_t             old_copy;
3254         vm_page_t               p;
3255         vm_object_size_t        copy_size = src_offset + size;
3256
3257
3258         int collisions = 0;     /* failed try-locks of old_copy during this call */
3259         /*
3260          *      The user-level memory manager wants to see all of the changes
3261          *      to this object, but it has promised not to make any changes on
3262          *      its own.
3263          *
3264          *      Perform an asymmetric copy-on-write, as follows:
3265          *              Create a new object, called a "copy object" to hold
3266          *               pages modified by the new mapping  (i.e., the copy,
3267          *               not the original mapping).
3268          *              Record the original object as the backing object for
3269          *               the copy object.  If the original mapping does not
3270          *               change a page, it may be used read-only by the copy.
3271          *              Record the copy object in the original object.
3272          *               When the original mapping causes a page to be modified,
3273          *               it must be copied to a new page that is "pushed" to
3274          *               the copy object.
3275          *              Mark the new mapping (the copy object) copy-on-write.
3276          *               This makes the copy object itself read-only, allowing
3277          *               it to be reused if the original mapping makes no
3278          *               changes, and simplifying the synchronization required
3279          *               in the "push" operation described above.
3280          *
3281          *      The copy-on-write is said to be asymmetric because the original
3282          *      object is *not* marked copy-on-write. A copied page is pushed
3283          *      to the copy object, regardless which party attempted to modify
3284          *      the page.
3285          *
3286          *      Repeated asymmetric copy operations may be done. If the
3287          *      original object has not been changed since the last copy, its
3288          *      copy object can be reused. Otherwise, a new copy object can be
3289          *      inserted between the original object and its previous copy
3290          *      object.  Since any copy object is read-only, this cannot
3291          *      affect the contents of the previous copy object.
3292          *
3293          *      Note that a copy object is higher in the object tree than the
3294          *      original object; therefore, use of the copy object recorded in
3295          *      the original object must be done carefully, to avoid deadlock.
3296          */
3297
3298  Retry:
3299
3300         /*
3301          * Wait for paging in progress.
3302          */
3303         if (!src_object->true_share &&
3304             (src_object->paging_in_progress != 0 ||
3305              src_object->activity_in_progress != 0)) {
3306                 if (src_object_shared == TRUE) {        /* retake the lock without the shared flag and start over */
3307                         vm_object_unlock(src_object);
3308                         vm_object_lock(src_object);
3309                         src_object_shared = FALSE;
3310                         goto Retry;
3311                 }
3312                 vm_object_paging_wait(src_object, THREAD_UNINT);
3313         }
3314         /*
3315          *      See whether we can reuse the result of a previous
3316          *      copy operation.
3317          */
3318
3319         old_copy = src_object->copy;
3320         if (old_copy != VM_OBJECT_NULL) {
3321                 int lock_granted;
3322
3323                 /*
3324                  *      Try to get the locks (out of order)
3325                  */
3326                 if (src_object_shared == TRUE)
3327                         lock_granted = vm_object_lock_try_shared(old_copy);
3328                 else
3329                         lock_granted = vm_object_lock_try(old_copy);
3330
3331                 if (!lock_granted) {    /* back off: drop src lock, pause, relock, retry */
3332                         vm_object_unlock(src_object);
3333
3334                         if (collisions++ == 0)
3335                                 copy_delayed_lock_contention++;
3336                         mutex_pause(collisions);
3337
3338                         /* Heisenberg Rules */
3339                         copy_delayed_lock_collisions++;
3340
3341                         if (collisions > copy_delayed_max_collisions)
3342                                 copy_delayed_max_collisions = collisions;
3343
3344                         if (src_object_shared == TRUE)
3345                                 vm_object_lock_shared(src_object);
3346                         else
3347                                 vm_object_lock(src_object);
3348
3349                         goto Retry;
3350                 }
3351
3352                 /*
3353                  *      Determine whether the old copy object has
3354                  *      been modified.
3355                  */
3356
3357                 if (old_copy->resident_page_count == 0 &&
3358                     !old_copy->pager_created) {
3359                         /*
3360                          *      It has not been modified.
3361                          *
3362                          *      Return another reference to
3363                          *      the existing copy-object if
3364                          *      we can safely grow it (if
3365                          *      needed).
3366                          */
3367
3368                         if (old_copy->size < copy_size) {
3369                                 if (src_object_shared == TRUE) {        /* growing writes old_copy->size: redo without the shared flag */
3370                                         vm_object_unlock(old_copy);
3371                                         vm_object_unlock(src_object);
3372
3373                                         vm_object_lock(src_object);
3374                                         src_object_shared = FALSE;
3375                                         goto Retry;
3376                                 }
3377                                 /*
3378                                  * We can't perform a delayed copy if any of the
3379                                  * pages in the extended range are wired (because
3380                                  * we can't safely take write permission away from
3381                                  * wired pages).  If the pages aren't wired, then
3382                                  * go ahead and protect them.
3383                                  */
3384                                 copy_delayed_protect_iterate++;
3385
3386                                 queue_iterate(&src_object->memq, p, vm_page_t, listq) {
3387                                         if (!p->fictitious &&
3388                                             p->offset >= old_copy->size &&
3389                                             p->offset < copy_size) {
3390                                                 if (VM_PAGE_WIRED(p)) {
3391                                                         vm_object_unlock(old_copy);
3392                                                         vm_object_unlock(src_object);
3393
3394                                                         if (new_copy != VM_OBJECT_NULL) {
3395                                                                 vm_object_unlock(new_copy);
3396                                                                 vm_object_deallocate(new_copy);
3397                                                         }
3398
3399                                                         return VM_OBJECT_NULL;
3400                                                 } else {
3401                                                         pmap_page_protect(p->phys_page,
3402                                                                           (VM_PROT_ALL & ~VM_PROT_WRITE));
3403                                                 }
3404                                         }
3405                                 }
3406                                 old_copy->size = copy_size;
3407                         }
3408                         if (src_object_shared == TRUE)
3409                                 vm_object_reference_shared(old_copy);
3410                         else
3411                                 vm_object_reference_locked(old_copy);
3412                         vm_object_unlock(old_copy);
3413                         vm_object_unlock(src_object);
3414
3415                         if (new_copy != VM_OBJECT_NULL) {       /* a copy allocated on an earlier pass is not needed */
3416                                 vm_object_unlock(new_copy);
3417                                 vm_object_deallocate(new_copy);
3418                         }
3419                         return(old_copy);
3420                 }
3421
3422                 /* The old copy object cannot be reused as is: a new copy object is needed. */
3423
3424                 /*
3425                  * Adjust the size argument so that the newly-created
3426                  * copy object will be large enough to back either the
3427                  * old copy object or the new mapping.
3428                  */
3429                 if (old_copy->size > copy_size)
3430                         copy_size = old_copy->size;
3431
3432                 if (new_copy == VM_OBJECT_NULL) {       /* allocate with both locks dropped, then re-examine everything */
3433                         vm_object_unlock(old_copy);
3434                         vm_object_unlock(src_object);
3435                         new_copy = vm_object_allocate(copy_size);
3436                         vm_object_lock(src_object);
3437                         vm_object_lock(new_copy);
3438
3439                         src_object_shared = FALSE;
3440                         goto Retry;
3441                 }
3442                 new_copy->size = copy_size;
3443
3444                 /*
3445                  *      The copy-object is always made large enough to
3446                  *      completely shadow the original object, since
3447                  *      it may have several users who want to shadow
3448                  *      the original object at different points.
3449                  */
3450
3451                 assert((old_copy->shadow == src_object) &&
3452                     (old_copy->shadow_offset == (vm_object_offset_t) 0));
3453
3454         } else if (new_copy == VM_OBJECT_NULL) {        /* no previous copy: allocate unlocked, then re-examine */
3455                 vm_object_unlock(src_object);
3456                 new_copy = vm_object_allocate(copy_size);
3457                 vm_object_lock(src_object);
3458                 vm_object_lock(new_copy);
3459
3460                 src_object_shared = FALSE;
3461                 goto Retry;
3462         }
3463
3464         /*
3465          * We now have the src object locked, and the new copy object
3466          * allocated and locked (and potentially the old copy locked).
3467          * Before we go any further, make sure we can still perform
3468          * a delayed copy, as the situation may have changed.
3469          *
3470          * Specifically, we can't perform a delayed copy if any of the
3471          * pages in the range are wired (because we can't safely take
3472          * write permission away from wired pages).  If the pages aren't
3473          * wired, then go ahead and protect them.
3474          */
3475         copy_delayed_protect_iterate++;
3476
3477         queue_iterate(&src_object->memq, p, vm_page_t, listq) {
3478                 if (!p->fictitious && p->offset < copy_size) {
3479                         if (VM_PAGE_WIRED(p)) {
3480                                 if (old_copy)
3481                                         vm_object_unlock(old_copy);
3482                                 vm_object_unlock(src_object);
3483                                 vm_object_unlock(new_copy);
3484                                 vm_object_deallocate(new_copy);
3485                                 return VM_OBJECT_NULL;
3486                         } else {
3487                                 pmap_page_protect(p->phys_page,
3488                                                   (VM_PROT_ALL & ~VM_PROT_WRITE));
3489                         }
3490                 }
3491         }
3492         if (old_copy != VM_OBJECT_NULL) {
3493                 /*
3494                  *      Make the old copy-object shadow the new one.
3495                  *      It will receive no more pages from the original
3496                  *      object.
3497                  */
3498
3499                 /* remove ref. from old_copy */
3500                 vm_object_lock_assert_exclusive(src_object);
3501                 src_object->ref_count--;
3502                 assert(src_object->ref_count > 0);
3503                 vm_object_lock_assert_exclusive(old_copy);
3504                 old_copy->shadow = new_copy;
3505                 vm_object_lock_assert_exclusive(new_copy);
3506                 assert(new_copy->ref_count > 0);
3507                 new_copy->ref_count++;          /* for old_copy->shadow ref. */
3508
3509 #if TASK_SWAPPER
3510                 if (old_copy->res_count) {
3511                         VM_OBJ_RES_INCR(new_copy);
3512                         VM_OBJ_RES_DECR(src_object);
3513                 }
3514 #endif
3515
3516                 vm_object_unlock(old_copy);     /* done with old_copy */
3517         }
3518
3519         /*
3520          *      Point the new copy at the existing object.
3521          */
3522         vm_object_lock_assert_exclusive(new_copy);
3523         new_copy->shadow = src_object;
3524         new_copy->shadow_offset = 0;
3525         new_copy->shadowed = TRUE;      /* caller must set needs_copy */
3526
3527         vm_object_lock_assert_exclusive(src_object);
3528         vm_object_reference_locked(src_object);         /* reference held through new_copy->shadow */
3529         src_object->copy = new_copy;
3530         vm_object_unlock(src_object);
3531         vm_object_unlock(new_copy);
3532
3533         XPR(XPR_VM_OBJECT,
3534                 "vm_object_copy_delayed: used copy object %X for source %X\n",
3535                 new_copy, src_object, 0, 0, 0);
3536
3537         return new_copy;
3538 }
3539
3540 /*
3541  *      Routine:        vm_object_copy_strategically
3542  *      Purpose:
3543  *              Perform a copy according to the source object's
3544  *              declared strategy.  This operation may block,
3545  *              and may be interrupted.  Returns a kern_return_t;
3546  *              each case below notes which OUT arguments it sets.
3547  */
3548 __private_extern__ kern_return_t
3549 vm_object_copy_strategically(
3550         register vm_object_t    src_object,
3551         vm_object_offset_t      src_offset,
3552         vm_object_size_t        size,
3553         vm_object_t             *dst_object,    /* OUT */
3554         vm_object_offset_t      *dst_offset,    /* OUT */
3555         boolean_t               *dst_needs_copy) /* OUT */
3556 {
3557         kern_return_t   result;         /* KERN_* status handed back to the caller */
3558         boolean_t       interruptible = THREAD_ABORTSAFE; /* XXX */
3559         boolean_t       object_lock_shared = FALSE;
3560         memory_object_copy_strategy_t copy_strategy;
3561
3562         assert(src_object != VM_OBJECT_NULL);
3563         /* NOTE: the strategy is sampled before the object lock is taken. */
3564         copy_strategy = src_object->copy_strategy;
3565         /* The DELAY strategy starts with a shared lock; all others lock exclusively. */
3566         if (copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
3567                 vm_object_lock_shared(src_object);
3568                 object_lock_shared = TRUE;
3569         } else
3570                 vm_object_lock(src_object);
3571
3572         /*
3573          *      The copy strategy is only valid if the memory manager
3574          *      is "ready". Internal objects are always ready.
3575          */
3576
3577         while (!src_object->internal && !src_object->pager_ready) {
3578                 wait_result_t wait_result;
3579
3580                 if (object_lock_shared == TRUE) {       /* relock exclusively before sleeping */
3581                         vm_object_unlock(src_object);
3582                         vm_object_lock(src_object);
3583                         object_lock_shared = FALSE;
3584                         continue;
3585                 }
3586                 wait_result = vm_object_sleep(  src_object,
3587                                                 VM_OBJECT_EVENT_PAGER_READY,
3588                                                 interruptible);
3589                 if (wait_result != THREAD_AWAKENED) {   /* interrupted: clear all OUT arguments */
3590                         vm_object_unlock(src_object);
3591                         *dst_object = VM_OBJECT_NULL;
3592                         *dst_offset = 0;
3593                         *dst_needs_copy = FALSE;
3594                         return(MACH_SEND_INTERRUPTED);
3595                 }
3596         }
3597
3598         /*
3599          *      Use the appropriate copy strategy.
3600          */
3601
3602         switch (copy_strategy) {
3603             case MEMORY_OBJECT_COPY_DELAY:
3604                 *dst_object = vm_object_copy_delayed(src_object,
3605                                                      src_offset, size, object_lock_shared);
3606                 if (*dst_object != VM_OBJECT_NULL) {
3607                         *dst_offset = src_offset;
3608                         *dst_needs_copy = TRUE;
3609                         result = KERN_SUCCESS;
3610                         break;
3611                 }
3612                 vm_object_lock(src_object);     /* vm_object_copy_delayed returned with it unlocked */
3613                 /* fall thru when delayed copy not allowed */
3614
3615             case MEMORY_OBJECT_COPY_NONE:
3616                 result = vm_object_copy_slowly(src_object, src_offset, size,
3617                                                interruptible, dst_object);
3618                 if (result == KERN_SUCCESS) {
3619                         *dst_offset = 0;
3620                         *dst_needs_copy = FALSE;
3621                 }
3622                 break;
3623
3624             case MEMORY_OBJECT_COPY_CALL:
3625                 result = vm_object_copy_call(src_object, src_offset, size,
3626                                 dst_object);
3627                 if (result == KERN_SUCCESS) {
3628                         *dst_offset = src_offset;
3629                         *dst_needs_copy = TRUE;
3630                 }
3631                 break;
3632
3633             case MEMORY_OBJECT_COPY_SYMMETRIC:
3634                 XPR(XPR_VM_OBJECT, "v_o_c_strategically obj 0x%x off 0x%x size 0x%x\n", src_object, src_offset, size, 0, 0);
3635                 vm_object_unlock(src_object);
3636                 result = KERN_MEMORY_RESTART_COPY;      /* no copy is made here; OUT arguments are not written */
3637                 break;
3638
3639             default:
3640                 panic("copy_strategically: bad strategy");
3641                 result = KERN_INVALID_ARGUMENT;
3642         }
3643         return(result);
3644 }
3645
3646 /*
3647  *      vm_object_shadow:
3648  *
3649  *      Put a newly allocated object of the given length in
3650  *      front of the existing object range.  The new object
3651  *      and a zero offset are handed back through the same
3652  *      parameters that supplied the source.  Returns TRUE
3653  *      when a shadow was made, FALSE when the check in the
3654  *      body decided that none is needed.
3655  */
3656 boolean_t vm_object_shadow_check = FALSE;
3657
3658 __private_extern__ boolean_t
3659 vm_object_shadow(
3660         vm_object_t             *object,        /* IN/OUT */
3661         vm_object_offset_t      *offset,        /* IN/OUT */
3662         vm_object_size_t        length)
3663 {
3664         vm_object_t     backing = *object;
3665         vm_object_t     front;
3666
3667 #if 0
3668         /*
3669          * XXX FBDP
3670          * This assertion is valid but it gets triggered by Rosetta for example
3671          * due to a combination of vm_remap() that changes a VM object's
3672          * copy_strategy from SYMMETRIC to DELAY and vm_protect(VM_PROT_COPY)
3673          * that then sets "needs_copy" on its map entry.  This creates a
3674          * mapping situation that VM should never see and doesn't know how to
3675          * handle.
3676          * It's not clear if this can create any real problem but we should
3677          * look into fixing this, probably by having vm_protect(VM_PROT_COPY)
3678          * do more than just set "needs_copy" to handle the copy-on-write...
3679          * In the meantime, let's disable the assertion.
3680          */
3681         assert(backing->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
3682 #endif
3683
3684         /*
3685          *      Determine if we really need a shadow.  The tests
3686          *      run in this order: the global switch, a single
3687          *      reference, then the state of the backing chain.
3688          */
3689         if (vm_object_shadow_check && backing->ref_count == 1) {
3690                 if (backing->shadow == VM_OBJECT_NULL ||
3691                     backing->shadow->copy == VM_OBJECT_NULL) {
3692                         backing->shadowed = FALSE;
3693                         return FALSE;
3694                 }
3695         }
3696
3697         /*
3698          *      Allocate a new object with the given length;
3699          *      coming back empty-handed is fatal.
3700          */
3701         front = vm_object_allocate(length);
3702         if (front == VM_OBJECT_NULL)
3703                 panic("vm_object_shadow: no object for shadowing");
3704
3705         /*
3706          *      The new object shadows the source object, adding
3707          *      a reference to it.  Our caller changes his reference
3708          *      to point to the new object, removing a reference to
3709          *      the source object.  Net result: no change of reference
3710          *      count.
3711          */
3712         front->shadow = backing;
3713
3714         /*
3715          *      The shadow offset is the caller's old offset into
3716          *      the source; read it before *offset is reset below.
3717          */
3718         front->shadow_offset = *offset;
3719
3720         /*
3721          *      Return the new things
3722          */
3723         *object = front;
3724         *offset = 0;
3725
3726         return TRUE;
3727 }
3728
3729 /*
3730  *      The relationship between vm_object structures and
3731  *      the memory_object requires careful synchronization.
3732  *
3733  *      All associations are created by memory_object_create_named
3734  *  for external pagers and vm_object_pager_create for internal
3735  *  objects as follows:
3736  *
3737  *              pager:  the memory_object itself, supplied by
3738  *                      the user requesting a mapping (or the kernel,
3739  *                      when initializing internal objects); the
3740  *                      kernel simulates holding send rights by keeping
3741  *                      a port reference;
3742  *
3743  *              pager_request:
3744  *                      the memory object control port,
3745  *                      created by the kernel; the kernel holds
3746  *                      receive (and ownership) rights to this
3747  *                      port, but no other references.
3748  *
3749  *      When initialization is complete, the "initialized" field
3750  *      is asserted.  Other mappings using a particular memory object,
3751  *      and any references to the vm_object gained through the
3752  *      port association must wait for this initialization to occur.
3753  *
3754  *      In order to allow the memory manager to set attributes before
3755  *      requests (notably virtual copy operations, but also data or
3756  *      unlock requests) are made, a "ready" attribute is made available.
3757  *      Only the memory manager may affect the value of this attribute.
3758  *      Its value does not affect critical kernel functions, such as
3759  *      internal object initialization or destruction.  [Furthermore,
3760  *      memory objects created by the kernel are assumed to be ready
3761  *      immediately; the default memory manager need not explicitly
3762  *      set the "ready" attribute.]
3763  *
3764  *      [Both the "initialized" and "ready" attribute wait conditions
3765  *      use the "pager" field as the wait event.]
3766  *
3767  *      The port associations can be broken down by any of the
3768  *      following routines:
3769  *              vm_object_terminate:
3770  *                      No references to the vm_object remain, and
3771  *                      the object cannot (or will not) be cached.
3772  *                      This is the normal case, and is done even
3773  *                      though one of the other cases has already been
3774  *                      done.
3775  *              memory_object_destroy:
3776  *                      The memory manager has requested that the
3777  *                      kernel relinquish references to the memory
3778  *                      object. [The memory manager may not want to
3779  *                      destroy the memory object, but may wish to
3780  *                      refuse or tear down existing memory mappings.]
3781  *
3782  *      Each routine that breaks an association must break all of
3783  *      them at once.  At some later time, that routine must clear
3784  *      the pager field and release the memory object references.
3785  *      [Furthermore, each routine must cope with the simultaneous
3786  *      or previous operations of the others.]
3787  *
3788  *      In addition to the lock on the object, the vm_object_hash_lock
3789  *      governs the associations.  References gained through the
3790  *      association require use of the hash lock.
3791  *
3792  *      Because the pager field may be cleared spontaneously, it
3793  *      cannot be used to determine whether a memory object has
3794  *      ever been associated with a particular vm_object.  [This
3795  *      knowledge is important to the shadow object mechanism.]
3796  *      For this reason, an additional "created" attribute is
3797  *      provided.
3798  *
3799  *      During various paging operations, the pager reference found in the
3800  *      vm_object must be valid.  To prevent this from being released,
3801  *      (other than being removed, i.e., made null), routines may use
3802  *      the vm_object_paging_begin/end routines [actually, macros].
3803  *      The implementation uses the "paging_in_progress" and "wanted" fields.
3804  *      [Operations that alter the validity of the pager values include the
3805  *      termination routines and vm_object_collapse.]
3806  */
3807
3808
3809 /*
3810  *      Routine:        vm_object_enter
3811  *      Purpose:
3812  *              Find a VM object corresponding to the given
3813  *              pager; if no such object exists, create one,
3814  *              and initialize the pager.
      *      Parameters:
      *              pager:          memory object to look up.  When it is
      *                              null no lookup is done and the result
      *                              of vm_object_allocate(size) is returned.
      *              size:           passed to vm_object_allocate() when a
      *                              new object has to be created.
      *              internal:       when this call does the initialization,
      *                              stored in the object's "internal" and
      *                              "pager_trusted" fields; an internal
      *                              object is also marked pager_ready.
      *              init:           initial value of must_init: do the
      *                              pager initialization even if the
      *                              lookup finds an existing object.
      *              named:          set the object's "named" flag.
      *      Returns:
      *              The object, unlocked, once its "pager_initialized"
      *              flag has been seen set.
      *      Notes:
      *              ref_count is incremented only on the path that finds
      *              an existing object and does not initialize it.
      *              The call may block: waiting for a terminating object's
      *              hash entry, in mutex_pause() after a failed try-lock,
      *              and waiting for another thread's initialization.
3815  */
3816 vm_object_t
3817 vm_object_enter(
3818         memory_object_t         pager,
3819         vm_object_size_t        size,
3820         boolean_t               internal,
3821         boolean_t               init,
3822         boolean_t               named)
3823 {
3824         register vm_object_t    object;
3825         vm_object_t             new_object;     /* spare object allocated on a lookup miss */
3826         boolean_t               must_init;      /* TRUE when this call initializes the pager */
3827         vm_object_hash_entry_t  entry, new_entry;
3828         uint32_t        try_failed_count = 0;   /* failed try-locks so far; passed to mutex_pause() */
3829         lck_mtx_t       *lck;
3830
3831         if (pager == MEMORY_OBJECT_NULL)
3832                 return(vm_object_allocate(size));
3833
3834         new_object = VM_OBJECT_NULL;
3835         new_entry = VM_OBJECT_HASH_ENTRY_NULL;
3836         must_init = init;
3837
3838         /*
3839          *      Look for an object associated with this port.
          *      The lock returned by vm_object_hash_lock_spin() is
          *      held whenever the loop below is left.
3840          */
3841 Retry:
3842         lck = vm_object_hash_lock_spin(pager);
3843         do {
3844                 entry = vm_object_hash_lookup(pager, FALSE);
3845
3846                 if (entry == VM_OBJECT_HASH_ENTRY_NULL) {
3847                         if (new_object == VM_OBJECT_NULL) {
3848                                 /*
3849                                  *      We must unlock to create a new object;
3850                                  *      if we do so, we must try the lookup again.
3851                                  */
3852                                 vm_object_hash_unlock(lck);
3853                                 assert(new_entry == VM_OBJECT_HASH_ENTRY_NULL);
3854                                 new_entry = vm_object_hash_entry_alloc(pager);
3855                                 new_object = vm_object_allocate(size);
3856                                 lck = vm_object_hash_lock_spin(pager);
3857                         } else {
3858                                 /*
3859                                  *      Lookup failed twice, and we have something
3860                                  *      to insert; set the object.
                                  *      Ownership of the spare entry and object
                                  *      moves to the hash, so the spare pointers
                                  *      are cleared, and this thread becomes the
                                  *      one that initializes the pager.
3861                                  */
3862                                 vm_object_hash_insert(new_entry, new_object);
3863                                 entry = new_entry;
3864                                 new_entry = VM_OBJECT_HASH_ENTRY_NULL;
3865                                 new_object = VM_OBJECT_NULL;
3866                                 must_init = TRUE;
3867                         }
3868                 } else if (entry->object == VM_OBJECT_NULL) {
3869                         /*
3870                          *      If a previous object is being terminated,
3871                          *      we must wait for the termination message
3872                          *      to be queued (and lookup the entry again).
                          *      "entry" is reset to null so that the loop
                          *      iterates once more after the wakeup.
3873                          */
3874                         entry->waiting = TRUE;
3875                         entry = VM_OBJECT_HASH_ENTRY_NULL;
3876                         assert_wait((event_t) pager, THREAD_UNINT);
3877                         vm_object_hash_unlock(lck);
3878
3879                         thread_block(THREAD_CONTINUE_NULL);
3880                         lck = vm_object_hash_lock_spin(pager);
3881                 }
3882         } while (entry == VM_OBJECT_HASH_ENTRY_NULL);
3883
3884         object = entry->object;
3885         assert(object != VM_OBJECT_NULL);
3886
3887         if (!must_init) {       /* existing object found: just take a reference */
3888                 if ( !vm_object_lock_try(object)) {
3889
3890                         vm_object_hash_unlock(lck);     /* object lock busy: back off and redo the lookup */
3891
3892                         try_failed_count++;
3893                         mutex_pause(try_failed_count);  /* wait a bit */
3894                         goto Retry;
3895                 }
3896                 assert(!internal || object->internal);
3897 #if VM_OBJECT_CACHE
3898                 if (object->ref_count == 0) {   /* unreferenced: take it off the cached list */
3899                         if ( !vm_object_cache_lock_try()) {
3900
3901                                 vm_object_hash_unlock(lck);
3902                                 vm_object_unlock(object);
3903
3904                                 try_failed_count++;
3905                                 mutex_pause(try_failed_count);  /* wait a bit */
3906                                 goto Retry;
3907                         }
3908                         XPR(XPR_VM_OBJECT_CACHE,
3909                             "vm_object_enter: removing %x from cache, head (%x, %x)\n",
3910                                 object,
3911                                 vm_object_cached_list.next,
3912                                 vm_object_cached_list.prev, 0,0);
3913                         queue_remove(&vm_object_cached_list, object,
3914                                      vm_object_t, cached_list);
3915                         vm_object_cached_count--;
3916
3917                         vm_object_cache_unlock();
3918                 }
3919 #endif
3920                 if (named) {
3921                         assert(!object->named);
3922                         object->named = TRUE;
3923                 }
3924                 vm_object_lock_assert_exclusive(object);
3925                 object->ref_count++;
3926                 vm_object_res_reference(object);
3927
3928                 vm_object_hash_unlock(lck);
3929                 vm_object_unlock(object);
3930
3931                 VM_STAT_INCR(hits);
3932         } else
3933                 vm_object_hash_unlock(lck);     /* must_init: no reference is added on this path */
3934
3935         assert(object->ref_count > 0);
3936
3937         VM_STAT_INCR(lookups);
3938
3939         XPR(XPR_VM_OBJECT,
3940                 "vm_o_enter: pager 0x%x obj 0x%x must_init %d\n",
3941                 pager, object, must_init, 0, 0);
3942
3943         /*
3944          *      If we raced to create a vm_object but lost, let's
3945          *      throw away ours.
          *      The same goes for a spare hash entry that was
          *      never inserted.
3946          */
3947
3948         if (new_object != VM_OBJECT_NULL)
3949                 vm_object_deallocate(new_object);
3950
3951         if (new_entry != VM_OBJECT_HASH_ENTRY_NULL)
3952                 vm_object_hash_entry_free(new_entry);
3953
3954         if (must_init) {
3955                 memory_object_control_t control;
3956
3957                 /*
3958                  *      Allocate request port.
                  *      This is done before the object lock is taken.
3959                  */
3960
3961                 control = memory_object_control_allocate(object);
3962                 assert (control != MEMORY_OBJECT_CONTROL_NULL);
3963
3964                 vm_object_lock(object);
3965                 assert(object != kernel_object);
3966
3967                 /*
3968                  *      Copy the reference we were given.
3969                  */
3970
3971                 memory_object_reference(pager);
3972                 object->pager_created = TRUE;
3973                 object->pager = pager;
3974                 object->internal = internal;
3975                 object->pager_trusted = internal;
3976                 if (!internal) {
3977                         /* copy strategy invalid until set by memory manager */
3978                         object->copy_strategy = MEMORY_OBJECT_COPY_INVALID;
3979                 }
3980                 object->pager_control = control;
3981                 object->pager_ready = FALSE;
3982
3983                 vm_object_unlock(object);
3984
3985                 /*
3986                  *      Let the pager know we're using it.
                  *      The call is made with the object unlocked.
                  *      NOTE(review): object->pager_control is read here
                  *      without the lock; it was set to "control" just
                  *      above -- confirm nothing can change it in between.
3987                  */
3988
3989                 (void) memory_object_init(pager,
3990                         object->pager_control,
3991                         PAGE_SIZE);
3992
3993                 vm_object_lock(object);
3994                 if (named)
3995                         object->named = TRUE;
3996                 if (internal) {
3997                         object->pager_ready = TRUE;     /* internal objects do not wait for the pager to set "ready" */
3998                         vm_object_wakeup(object, VM_OBJECT_EVENT_PAGER_READY);
3999                 }
4000
4001                 object->pager_initialized = TRUE;
4002                 vm_object_wakeup(object, VM_OBJECT_EVENT_INITIALIZED);
4003         } else {
4004                 vm_object_lock(object);
4005         }
4006
4007         /*
4008          *      [At this point, the object must be locked]
4009          */
4010
4011         /*
4012          *      Wait for the work above to be done by the first
4013          *      thread to map this object.
          *      A thread that did the initialization itself finds
          *      the flag already set and does not sleep.
4014          */
4015
4016         while (!object->pager_initialized) {
4017                 vm_object_sleep(object,
4018                                 VM_OBJECT_EVENT_INITIALIZED,
4019                                 THREAD_UNINT);
4020         }
4021         vm_object_unlock(object);
4022
4023         XPR(XPR_VM_OBJECT,
4024             "vm_object_enter: vm_object %x, memory_object %x, internal %d\n",
4025             object, object->pager, internal, 0,0);
4026         return(object);
4027 }
4028
4029 /*
4030  *      Routine:        vm_object_pager_create
4031  *      Purpose:
4032  *              Create a memory object for an internal object.
      *              The new pager is obtained from memory_object_create()
      *              using the default memory manager, entered in the
      *              pager hash for "object", and then initialized through
      *              vm_object_enter().
4033  *      In/out conditions:
4034  *              The object is locked on entry and exit;
4035  *              it may be unlocked within this call.
      *              The routine returns without doing anything when
      *              memory_manager_default_check() does not return
      *              KERN_SUCCESS, and returns after waiting for
      *              "pager_initialized" when the object already has
      *              "pager_created" set.
4036  *      Limitations:
4037  *              Only one thread may be performing a
4038  *              vm_object_pager_create on an object at
4039  *              a time.  Presumably, only the pageout
4040  *              daemon will be using this routine.
4041  */
4042
4043 void
4044 vm_object_pager_create(
4045         register vm_object_t    object)
4046 {
4047         memory_object_t         pager;
4048         vm_object_hash_entry_t  entry;
4049         lck_mtx_t               *lck;
4050 #if     MACH_PAGEMAP
4051         vm_object_size_t        size;
4052         vm_external_map_t       map;
4053 #endif  /* MACH_PAGEMAP */
4054
4055         XPR(XPR_VM_OBJECT, "vm_object_pager_create, object 0x%X\n",
4056                 object, 0,0,0,0);
4057
4058         assert(object != kernel_object);
4059
4060         if (memory_manager_default_check() != KERN_SUCCESS)
4061                 return;         /* check failed: object is left as it was, still locked */
4062
4063         /*
4064          *      Prevent collapse or termination by holding a paging reference
4065          */
4066
4067         vm_object_paging_begin(object);
4068         if (object->pager_created) {
4069                 /*
4070                  *      Someone else got to it first...
4071                  *      wait for them to finish initializing the ports
4072                  */
4073                 while (!object->pager_initialized) {
4074                         vm_object_sleep(object,
4075                                         VM_OBJECT_EVENT_INITIALIZED,
4076                                         THREAD_UNINT);
4077                 }
4078                 vm_object_paging_end(object);
4079                 return;
4080         }
4081
4082         /*
4083          *      Indicate that a memory object has been assigned
4084          *      before dropping the lock, to prevent a race.
4085          */
4086
4087         object->pager_created = TRUE;
4088         object->paging_offset = 0;
4089
4090 #if     MACH_PAGEMAP
4091         size = object->size;    /* sampled under the lock; re-checked by the assert below */
4092 #endif  /* MACH_PAGEMAP */
4093         vm_object_unlock(object);
4094
4095 #if     MACH_PAGEMAP
4096         map = vm_external_create(size);         /* built with the object unlocked */
4097         vm_object_lock(object);
4098         assert(object->size == size);
4099         object->existence_map = map;
4100         vm_object_unlock(object);
4101 #endif  /* MACH_PAGEMAP */
4102
         /*
          *      The size must survive a round trip through uint32_t.
          *      NOTE(review): object->size is read here and further
          *      down while the object is unlocked -- confirm that the
          *      paging reference is enough to keep it stable.
          */
4103         if ((uint32_t) object->size != object->size) {
4104                 panic("vm_object_pager_create(): object size 0x%llx >= 4GB\n",
4105                       (uint64_t) object->size);
4106         }
4107
4108         /*
4109          *      Create the [internal] pager, and associate it with this object.
4110          *
4111          *      We make the association here so that vm_object_enter()
4112          *      can look up the object to complete initializing it.  No
4113          *      user will ever map this object.
          *
          *      NOTE(review): the result of memory_object_create() is
          *      discarded; "pager" has no initializer, so it would be
          *      used uninitialized if that call can fail without
          *      storing to it -- confirm.
4114          */
4115         {
4116                 memory_object_default_t         dmm;
4117
4118                 /* acquire a reference for the default memory manager */
4119                 dmm = memory_manager_default_reference();
4120
4121                 assert(object->temporary);
4122
4123                 /* create our new memory object */
4124                 assert((vm_size_t) object->size == object->size);
4125                 (void) memory_object_create(dmm, (vm_size_t) object->size,
4126                                             &pager);
4127
4128                 memory_object_default_deallocate(dmm);
4129        }
4130
4131         entry = vm_object_hash_entry_alloc(pager);
4132
4133         lck = vm_object_hash_lock_spin(pager);
4134         vm_object_hash_insert(entry, object);
4135         vm_object_hash_unlock(lck);
4136
4137         /*
4138          *      A reference was returned by
4139          *      memory_object_create(), and it is
4140          *      copied by vm_object_enter().
          *      The call passes internal = TRUE, init = TRUE and
          *      named = FALSE, and must hand back the very object
          *      that was inserted in the hash above.
4141          */
4142
4143         if (vm_object_enter(pager, object->size, TRUE, TRUE, FALSE) != object)
4144                 panic("vm_object_pager_create: mismatch");
4145
4146         /*
4147          *      Drop the reference we were passed.
4148          */
4149         memory_object_deallocate(pager);
4150
4151         vm_object_lock(object);         /* restore the caller's lock before returning */
4152
4153         /*
4154          *      Release the paging reference
4155          */
4156         vm_object_paging_end(object);
4157 }
4158
4159 /*
4160  *      Routine:        vm_object_remove
4161  *      Purpose:
4162  *              Break the pager/object association for the
4163  *              pager of "object": the pager's hash entry, if
4164  *              one is found, has its object pointer cleared.
4165  *      Conditions:
4166  *              The object cache must be locked.
4167  */
4168 __private_extern__ void
4169 vm_object_remove(
4170         vm_object_t     object)
4171 {
4172         memory_object_t         mem_obj = object->pager;
4173         vm_object_hash_entry_t  slot;
4174
4175         if (mem_obj == MEMORY_OBJECT_NULL)
4176                 return;         /* no pager, nothing to undo */
4177
4178         slot = vm_object_hash_lookup(mem_obj, FALSE);
4179         if (slot != VM_OBJECT_HASH_ENTRY_NULL)
4180                 slot->object = VM_OBJECT_NULL;
4181 }
4182
4183 /*
4184  *      Global variables for vm_object_collapse():
4185  *
4186  *              Counts for normal collapses and bypasses.
4187  *              Debugging variables, to watch or disable collapse.
4188  */
4189 static long     object_collapses = 0;   /* incremented at the end of vm_object_do_collapse() */
4190 static long     object_bypasses  = 0;   /* presumably the vm_object_do_bypass() counterpart -- confirm */
4191
4192 static boolean_t        vm_object_collapse_allowed = TRUE;      /* not consulted in this part of the file */
4193 static boolean_t        vm_object_bypass_allowed = TRUE;        /* not consulted in this part of the file */
4194
4195 #if MACH_PAGEMAP
4196 static int      vm_external_discarded;  /* existence maps destroyed by vm_object_do_collapse() */
4197 static int      vm_external_collapsed;  /* existence maps handed to the parent by vm_object_do_collapse() */
4198 #endif
4199
4200 unsigned long vm_object_collapse_encrypted = 0; /* encrypted in-range pages seen by vm_object_do_collapse() */
4201
4202 /*
4203  *      Routine:        vm_object_do_collapse
4204  *      Purpose:
4205  *              Collapse an object with the object backing it.
4206  *              Pages in the backing object are moved into the
4207  *              parent, and the backing object is deallocated.
      *              The backing object's pager (if any) and its own
      *              shadow link are transferred to the parent as well.
      *      Parameters:
      *              object:         the parent, which survives the collapse.
      *              backing_object: the object being absorbed; its lock is
      *                              destroyed and it is returned to
      *                              vm_object_zone before this routine
      *                              returns, so it must not be used
      *                              afterwards.
4208  *      Conditions:
4209  *              Both objects and the cache are locked; the page
4210  *              queues are unlocked.
      *              Both object locks are asserted to be held exclusive.
      *              "object" is still locked on return.
4211  *
4212  */
4213 static void
4214 vm_object_do_collapse(
4215         vm_object_t object,
4216         vm_object_t backing_object)
4217 {
4218         vm_page_t p, pp;
4219         vm_object_offset_t new_offset, backing_offset;
4220         vm_object_size_t size;
4221
4222         vm_object_lock_assert_exclusive(object);
4223         vm_object_lock_assert_exclusive(backing_object);
4224
4225         backing_offset = object->shadow_offset; /* offset in backing_object of the parent's offset 0 */
4226         size = object->size;
4227
4228         /*
4229          *      Move all in-memory pages from backing_object
4230          *      to the parent.  Pages that have been paged out
4231          *      will be overwritten by any of the parent's
4232          *      pages that shadow them.
          *
          *      The loop always works on the first page of the
          *      backing object's queue and ends when that queue is
          *      empty; presumably VM_PAGE_FREE() and vm_page_rename()
          *      both take the page off that queue -- confirm.
4233          */
4234
4235         while (!queue_empty(&backing_object->memq)) {
4236
4237                 p = (vm_page_t) queue_first(&backing_object->memq);
4238
4239                 new_offset = (p->offset - backing_offset);      /* the page's offset as seen from the parent */
4240
4241                 assert(!p->busy || p->absent);
4242
4243                 /*
4244                  *      If the parent has a page here, or if
4245                  *      this page falls outside the parent,
4246                  *      dispose of it.
4247                  *
4248                  *      Otherwise, move it as planned.
4249                  */
4250
4251                 if (p->offset < backing_offset || new_offset >= size) {
4252                         VM_PAGE_FREE(p);
4253                 } else {
4254                         /*
4255                          * ENCRYPTED SWAP:
4256                          * The encryption key includes the "pager" and the
4257                          * "paging_offset".  These will not change during the
4258                          * object collapse, so we can just move an encrypted
4259                          * page from one object to the other in this case.
4260                          * We can't decrypt the page here, since we can't drop
4261                          * the object lock.
4262                          */
4263                         if (p->encrypted) {
4264                                 vm_object_collapse_encrypted++;
4265                         }
4266                         pp = vm_page_lookup(object, new_offset);
4267                         if (pp == VM_PAGE_NULL) {
4268
4269                                 /*
4270                                  *      Parent now has no page.
4271                                  *      Move the backing object's page up.
4272                                  */
4273
4274                                 vm_page_rename(p, object, new_offset, TRUE);
4275 #if     MACH_PAGEMAP
4276                         } else if (pp->absent) {
4277
4278                                 /*
4279                                  *      Parent has an absent page...
4280                                  *      it's not being paged in, so
4281                                  *      it must really be missing from
4282                                  *      the parent.
4283                                  *
4284                                  *      Throw out the absent page...
4285                                  *      any faults looking for that
4286                                  *      page will restart with the new
4287                                  *      one.
4288                                  */
4289
4290                                 VM_PAGE_FREE(pp);
4291                                 vm_page_rename(p, object, new_offset, TRUE);
4292 #endif  /* MACH_PAGEMAP */
4293                         } else {
4294                                 assert(! pp->absent);
4295
4296                                 /*
4297                                  *      Parent object has a real page.
4298                                  *      Throw away the backing object's
4299                                  *      page.
4300                                  */
4301                                 VM_PAGE_FREE(p);
4302                         }
4303                 }
4304         }
4305
         /*
          *      Without MACH_PAGEMAP at most one of the two objects may
          *      have a pager; with it, the parent must have none.
          */
4306 #if     !MACH_PAGEMAP
4307         assert((!object->pager_created && (object->pager == MEMORY_OBJECT_NULL))
4308                 || (!backing_object->pager_created
4309                 &&  (backing_object->pager == MEMORY_OBJECT_NULL)));
4310 #else
4311         assert(!object->pager_created && object->pager == MEMORY_OBJECT_NULL);
4312 #endif  /* !MACH_PAGEMAP */
4313
4314         if (backing_object->pager != MEMORY_OBJECT_NULL) {
4315                 vm_object_hash_entry_t  entry;
4316
4317                 /*
4318                  *      Move the pager from backing_object to object.
4319                  *
4320                  *      XXX We're only using part of the paging space
4321                  *      for keeps now... we ought to discard the
4322                  *      unused portion.
4323                  */
4324
4325                 assert(!object->paging_in_progress);
4326                 assert(!object->activity_in_progress);
4327                 object->pager = backing_object->pager;
4328
4329                 if (backing_object->hashed) {
4330                         lck_mtx_t       *lck;
4331
                         /*
                          *      Repoint the pager's hash entry at the
                          *      parent.  object->pager was set to the
                          *      backing object's pager just above, so the
                          *      lock and the lookup use the same pager.
                          */
4332                         lck = vm_object_hash_lock_spin(backing_object->pager);
4333                         entry = vm_object_hash_lookup(object->pager, FALSE);
4334                         assert(entry != VM_OBJECT_HASH_ENTRY_NULL);
4335                         entry->object = object;
4336                         vm_object_hash_unlock(lck);
4337
4338                         object->hashed = TRUE;
4339                 }
4340                 object->pager_created = backing_object->pager_created;
4341                 object->pager_control = backing_object->pager_control;
4342                 object->pager_ready = backing_object->pager_ready;
4343                 object->pager_initialized = backing_object->pager_initialized;
4344                 object->paging_offset =
4345                     backing_object->paging_offset + backing_offset;     /* rebase onto the parent's offsets */
4346                 if (object->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
4347                         memory_object_control_collapse(object->pager_control,
4348                                                        object);
4349                 }
4350         }
4351
4352 #if     MACH_PAGEMAP
4353         /*
4354          *      If the shadow offset is 0, then use the existence map from
4355          *      the backing object if there is one. If the shadow offset is
4356          *      not zero, toss it.
          *      (The map is also tossed when the two objects differ
          *      in size.)
4357          *
4358          *      XXX - If the shadow offset is not 0 then a bit copy is needed
4359          *      if the map is to be salvaged.  For now, we just toss the
4360          *      old map, giving the collapsed object no map. This means that
4361          *      the pager is invoked for zero fill pages.  If analysis shows
4362          *      that this happens frequently and is a performance hit, then
4363          *      this code should be fixed to salvage the map.
4364          */
4365         assert(object->existence_map == VM_EXTERNAL_NULL);
4366         if (backing_offset || (size != backing_object->size)) {
4367                 vm_external_discarded++;
4368                 vm_external_destroy(backing_object->existence_map,
4369                         backing_object->size);
4370         }
4371         else {
4372                 vm_external_collapsed++;
4373                 object->existence_map = backing_object->existence_map;
4374         }
4375         backing_object->existence_map = VM_EXTERNAL_NULL;
4376 #endif  /* MACH_PAGEMAP */
4377
4378         /*
4379          *      Object now shadows whatever backing_object did.
4380          *      Note that the reference to backing_object->shadow
4381          *      moves from within backing_object to within object.
          *      The two shadow offsets add up, since the parent's
          *      offset is now relative to the next object down.
4382          */
4383
4384         assert(!object->phys_contiguous);
4385         assert(!backing_object->phys_contiguous);
4386         object->shadow = backing_object->shadow;
4387         if (object->shadow) {
4388                 object->shadow_offset += backing_object->shadow_offset;
4389         } else {
4390                 /* no shadow, therefore no shadow offset... */
4391                 object->shadow_offset = 0;
4392         }
4393         assert((object->shadow == VM_OBJECT_NULL) ||
4394                (object->shadow->copy != backing_object));
4395
4396         /*
4397          *      Discard backing_object.
4398          *
4399          *      Since the backing object has no pages, no
4400          *      pager left, and no object references within it,
4401          *      all that is necessary is to dispose of it.
          *      It is therefore freed directly to its zone below
          *      rather than through the reference-counted release.
4402          */
4403
4404         assert((backing_object->ref_count == 1) &&
4405                (backing_object->resident_page_count == 0) &&
4406                (backing_object->paging_in_progress == 0) &&
4407                (backing_object->activity_in_progress == 0));
4408
4409         backing_object->alive = FALSE;
4410         vm_object_unlock(backing_object);
4411
4412         XPR(XPR_VM_OBJECT, "vm_object_collapse, collapsed 0x%X\n",
4413                 backing_object, 0,0,0,0);
4414
4415         vm_object_lock_destroy(backing_object);
4416
4417         zfree(vm_object_zone, backing_object);
4418
4419         object_collapses++;
4420 }
4421
4422 static void
4423 vm_object_do_bypass(
4424         vm_object_t object,
4425         vm_object_t backing_object)
4426 {
4427         /*
4428          *      Make the parent ("object") shadow the next object in the
4429          *      chain and drop the parent's reference on backing_object.
4430          */
4431         /* Entered with both locked exclusive; returns with only "object" locked. */
4432         vm_object_lock_assert_exclusive(object);
4433         vm_object_lock_assert_exclusive(backing_object);
4434
4435 #if     TASK_SWAPPER
4436         /*
4437          *      Do object reference in-line to
4438          *      conditionally increment shadow's
4439          *      residence count.  If object is not
4440          *      resident, leave residence count
4441          *      on shadow alone.
4442          */
4443         if (backing_object->shadow != VM_OBJECT_NULL) {
4444                 vm_object_lock(backing_object->shadow);
4445                 vm_object_lock_assert_exclusive(backing_object->shadow);
4446                 backing_object->shadow->ref_count++;
4447                 if (object->res_count != 0)
4448                         vm_object_res_reference(backing_object->shadow);
4449                 vm_object_unlock(backing_object->shadow);
4450         }
4451 #else   /* TASK_SWAPPER */
4452         vm_object_reference(backing_object->shadow); /* shadow may be VM_OBJECT_NULL (see below) */
4453 #endif  /* TASK_SWAPPER */
4454
4455         assert(!object->phys_contiguous);
4456         assert(!backing_object->phys_contiguous);
4457         object->shadow = backing_object->shadow;
4458         if (object->shadow) {
4459                 object->shadow_offset += backing_object->shadow_offset;
4460         } else {
4461                 /* no shadow, therefore no shadow offset... */
4462                 object->shadow_offset = 0;
4463         }
4464
4465         /*
4466          *      Backing object might have had a copy pointer
4467          *      to us.  If it did, clear it.
4468          */
4469         if (backing_object->copy == object) {
4470                 backing_object->copy = VM_OBJECT_NULL;
4471         }
4472
4473         /*
4474          *      Drop the reference count on backing_object.
4475 #if     TASK_SWAPPER
4476          *      Since its ref_count was at least 2, it
4477          *      will not vanish; so we don't need to call
4478          *      vm_object_deallocate.
4479          *      [with a caveat for "named" objects]
4480          *
4481          *      The res_count on the backing object is
4482          *      conditionally decremented.  It's possible
4483          *      (via vm_pageout_scan) to get here with
4484          *      a "swapped" object, which has a 0 res_count,
4485          *      in which case, the backing object res_count
4486          *      is already down by one.
4487 #else
4488          *      Don't call vm_object_deallocate unless
4489          *      ref_count drops to zero.
4490          *
4491          *      The ref_count can drop to zero here if the
4492          *      backing object could be bypassed but not
4493          *      collapsed, such as when the backing object
4494          *      is temporary and cachable.
4495 #endif
4496          */
4497         if (backing_object->ref_count > 2 ||
4498             (!backing_object->named && backing_object->ref_count > 1)) {
4499                 vm_object_lock_assert_exclusive(backing_object);
4500                 backing_object->ref_count--;
4501 #if     TASK_SWAPPER
4502                 if (object->res_count != 0)
4503                         vm_object_res_deallocate(backing_object);
4504                 assert(backing_object->ref_count > 0);
4505 #endif  /* TASK_SWAPPER */
4506                 vm_object_unlock(backing_object);
4507         } else {
4508
4509                 /*
4510                  *      Drop locks so that we can deallocate
4511                  *      the backing object.
4512                  */
4513
4514 #if     TASK_SWAPPER
4515                 if (object->res_count == 0) {
4516                         /* XXX get a reference for the deallocate below */
4517                         vm_object_res_reference(backing_object);
4518                 }
4519 #endif  /* TASK_SWAPPER */
4520                 vm_object_unlock(object);
4521                 vm_object_unlock(backing_object);
4522                 vm_object_deallocate(backing_object);
4523
4524                 /*
4525                  *      Relock object. We don't have to reverify
4526                  *      its state since vm_object_collapse will
4527                  *      do that for us as it starts at the
4528                  *      top of its loop.
4529                  */
4530
4531                 vm_object_lock(object);
4532         }
4533
4534         object_bypasses++;
4535 }
4536
4537
4538 /*
4539  *      vm_object_collapse:
4540  *
4541  *      Walk the shadow chain below "object"; for each object/backing
4542  *      object pair that passes the checks below, collapse the pair
4543  *      (vm_object_do_collapse) or, if "can_bypass" allows, bypass the
4544  *      backing object (vm_object_do_bypass).  Unless the object has a
4545  *      cow_hint, "hint_offset" seeds the search for show-through pages.
4546  *      Requires that the object be locked and the page queues be unlocked.
4547  */
4548 static unsigned long vm_object_collapse_calls = 0; /* calls to vm_object_collapse */
4549 static unsigned long vm_object_collapse_objects = 0; /* iterations of the main loop */
4550 static unsigned long vm_object_collapse_do_collapse = 0; /* calls to vm_object_do_collapse */
4551 static unsigned long vm_object_collapse_do_bypass = 0; /* calls to vm_object_do_bypass */
4552 static unsigned long vm_object_collapse_delays = 0; /* mutex_pause() calls during page walks */
4553 __private_extern__ void
4554 vm_object_collapse(
4555         register vm_object_t                    object,
4556         register vm_object_offset_t             hint_offset,
4557         boolean_t                               can_bypass)
4558 {
4559         register vm_object_t                    backing_object;
4560         register unsigned int                   rcount;
4561         register unsigned int                   size;
4562         vm_object_t                             original_object;
4563         int                                     object_lock_type;
4564         int                                     backing_object_lock_type;
4565
4566         vm_object_collapse_calls++;
4567
4568         if (! vm_object_collapse_allowed &&
4569             ! (can_bypass && vm_object_bypass_allowed)) {
4570                 return;
4571         }
4572
4573         XPR(XPR_VM_OBJECT, "vm_object_collapse, obj 0x%X\n",
4574                 object, 0,0,0,0);
4575
4576         if (object == VM_OBJECT_NULL)
4577                 return;
4578
4579         original_object = object;
4580
4581         /*
4582          * The top object was locked "exclusive" by the caller.
4583          * In the first pass, to determine if we can collapse the shadow chain,
4584          * take a "shared" lock on the shadow objects.  If we can collapse,
4585          * we'll have to go down the chain again with exclusive locks.
4586          */
4587         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4588         backing_object_lock_type = OBJECT_LOCK_SHARED;
4589
4590 retry:
4591         object = original_object;
4592         vm_object_lock_assert_exclusive(object);
4593
4594         while (TRUE) {
4595                 vm_object_collapse_objects++;
4596                 /*
4597                  *      Verify that the conditions are right for either
4598                  *      collapse or bypass:
4599                  */
4600
4601                 /*
4602                  *      There is a backing object, and
4603                  */
4604
4605                 backing_object = object->shadow;
4606                 if (backing_object == VM_OBJECT_NULL) {
4607                         if (object != original_object) {
4608                                 vm_object_unlock(object);
4609                         }
4610                         return;
4611                 }
4612                 if (backing_object_lock_type == OBJECT_LOCK_SHARED) {
4613                         vm_object_lock_shared(backing_object);
4614                 } else {
4615                         vm_object_lock(backing_object);
4616                 }
4617
4618                 /*
4619                  *      No pages in the object are currently
4620                  *      being paged out, and
4621                  */
4622                 if (object->paging_in_progress != 0 ||
4623                     object->activity_in_progress != 0) {
4624                         /* try and collapse the rest of the shadow chain */
4625                         if (object != original_object) {
4626                                 vm_object_unlock(object);
4627                         }
4628                         object = backing_object;
4629                         object_lock_type = backing_object_lock_type;
4630                         continue;
4631                 }
4632
4633                 /*
4634                  *      ...
4635                  *              The backing object is internal,
4636                  *              and no pages in the backing object are
4637                  *              currently being paged out (nor is any
4638                  *              other activity in progress on it).
4639                  *
4640                  */
4641
4642                 if (!backing_object->internal ||
4643                     backing_object->paging_in_progress != 0 ||
4644                     backing_object->activity_in_progress != 0) {
4645                         /* try and collapse the rest of the shadow chain */
4646                         if (object != original_object) {
4647                                 vm_object_unlock(object);
4648                         }
4649                         object = backing_object;
4650                         object_lock_type = backing_object_lock_type;
4651                         continue;
4652                 }
4653
4654                 /*
4655                  *      The backing object can't be a copy-object:
4656                  *      the shadow_offset for the copy-object must stay
4657                  *      as 0.  Furthermore (for the 'we have all the
4658                  *      pages' case), if we bypass backing_object and
4659                  *      just shadow the next object in the chain, old
4660                  *      pages from that object would then have to be copied
4661                  *      BOTH into the (former) backing_object and into the
4662                  *      parent object.
4663                  */
4664                 if (backing_object->shadow != VM_OBJECT_NULL &&
4665                     backing_object->shadow->copy == backing_object) {
4666                         /* try and collapse the rest of the shadow chain */
4667                         if (object != original_object) {
4668                                 vm_object_unlock(object);
4669                         }
4670                         object = backing_object;
4671                         object_lock_type = backing_object_lock_type;
4672                         continue;
4673                 }
4674
4675                 /*
4676                  *      We can now try to either collapse the backing
4677                  *      object (if the parent is the only reference to
4678                  *      it) or (perhaps) remove the parent's reference
4679                  *      to it.
4680                  *
4681                  *      If there is exactly one reference to the backing
4682                  *      object, we may be able to collapse it into the
4683                  *      parent.
4684                  *
4685                  *      If MACH_PAGEMAP is defined:
4686                  *      The parent must not have a pager created for it,
4687                  *      since collapsing a backing_object dumps new pages
4688                  *      into the parent that its pager doesn't know about
4689                  *      (and the collapse code can't merge the existence
4690                  *      maps).
4691                  *      Otherwise:
4692                  *      As long as one of the objects is still not known
4693                  *      to the pager, we can collapse them.
4694                  */
4695                 if (backing_object->ref_count == 1 &&
4696                     (!object->pager_created
4697 #if     !MACH_PAGEMAP
4698                      || !backing_object->pager_created
4699 #endif  /*!MACH_PAGEMAP */
4700                     ) && vm_object_collapse_allowed) {
4701
4702                         /*
4703                          * We need the exclusive lock on the VM objects.
4704                          */
4705                         if (backing_object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
4706                                 /*
4707                                  * We have an object and its shadow locked
4708                                  * "shared".  We can't just upgrade the locks
4709                                  * to "exclusive", as some other thread might
4710                                  * also have these objects locked "shared" and
4711                                  * attempt to upgrade one or the other to
4712                                  * "exclusive".  The upgrades would block
4713                                  * forever waiting for the other "shared" locks
4714                                  * to get released.
4715                                  * So we have to release the locks and go
4716                                  * down the shadow chain again (since it could
4717                                  * have changed) with "exclusive" locking.
4718                                  */
4719                                 vm_object_unlock(backing_object);
4720                                 if (object != original_object)
4721                                         vm_object_unlock(object);
4722                                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4723                                 backing_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4724                                 goto retry;
4725                         }
4726
4727                         XPR(XPR_VM_OBJECT,
4728                    "vm_object_collapse: %x to %x, pager %x, pager_control %x\n",
4729                                 backing_object, object,
4730                                 backing_object->pager,
4731                                 backing_object->pager_control, 0);
4732
4733                         /*
4734                          *      Collapse the object with its backing
4735                          *      object, and try again with the object's
4736                          *      new backing object.
4737                          */
4738
4739                         vm_object_do_collapse(object, backing_object);
4740                         vm_object_collapse_do_collapse++;
4741                         continue;
4742                 }
4743
4744                 /*
4745                  *      Collapsing the backing object was not possible
4746                  *      or permitted, so let's try bypassing it.
4747                  */
4748
4749                 if (! (can_bypass && vm_object_bypass_allowed)) {
4750                         /* try and collapse the rest of the shadow chain */
4751                         if (object != original_object) {
4752                                 vm_object_unlock(object);
4753                         }
4754                         object = backing_object;
4755                         object_lock_type = backing_object_lock_type;
4756                         continue;
4757                 }
4758
4759
4760                 /*
4761                  *      If the object doesn't have all its pages present,
4762                  *      we have to make sure no pages in the backing object
4763                  *      "show through" before bypassing it.
4764                  */
4765                 size = atop(object->size);
4766                 rcount = object->resident_page_count;
4767                 if (rcount != size) {
4768                         vm_object_offset_t      offset;
4769                         vm_object_offset_t      backing_offset;
4770                         unsigned int            backing_rcount;
4771                         unsigned int            lookups = 0;
4772
4773                         /*
4774                          *      If the backing object has a pager but no pagemap,
4775                          *      then we cannot bypass it, because we don't know
4776                          *      what pages it has.
4777                          */
4778                         if (backing_object->pager_created
4779 #if     MACH_PAGEMAP
4780                             && (backing_object->existence_map == VM_EXTERNAL_NULL)
4781 #endif  /* MACH_PAGEMAP */
4782                                 ) {
4783                                 /* try and collapse the rest of the shadow chain */
4784                                 if (object != original_object) {
4785                                         vm_object_unlock(object);
4786                                 }
4787                                 object = backing_object;
4788                                 object_lock_type = backing_object_lock_type;
4789                                 continue;
4790                         }
4791
4792                         /*
4793                          *      If the object has a pager but no pagemap,
4794                          *      then we cannot bypass it, because we don't know
4795                          *      what pages it has.
4796                          */
4797                         if (object->pager_created
4798 #if     MACH_PAGEMAP
4799                             && (object->existence_map == VM_EXTERNAL_NULL)
4800 #endif  /* MACH_PAGEMAP */
4801                                 ) {
4802                                 /* try and collapse the rest of the shadow chain */
4803                                 if (object != original_object) {
4804                                         vm_object_unlock(object);
4805                                 }
4806                                 object = backing_object;
4807                                 object_lock_type = backing_object_lock_type;
4808                                 continue;
4809                         }
4810
4811                         /*
4812                          *      If all of the pages in the backing object are
4813                          *      shadowed by the parent object, the parent
4814                          *      object no longer has to shadow the backing
4815                          *      object; it can shadow the next one in the
4816                          *      chain.
4817                          *
4818                          *      If the backing object has existence info,
4819                          *      we must examine its existence info
4820                          *      as well.
4821                          *
4822                          */
4823
4824                         backing_offset = object->shadow_offset;
4825                         backing_rcount = backing_object->resident_page_count;
4826                         /* EXISTS_IN_OBJECT counts each vm_page_lookup in "lookups" and decrements rc on a resident hit. */
4827 #if     MACH_PAGEMAP
4828 #define EXISTS_IN_OBJECT(obj, off, rc) \
4829         (vm_external_state_get((obj)->existence_map, \
4830          (vm_offset_t)(off)) == VM_EXTERNAL_STATE_EXISTS || \
4831          ((rc) && ++lookups && vm_page_lookup((obj), (off)) != VM_PAGE_NULL && (rc)--))
4832 #else
4833 #define EXISTS_IN_OBJECT(obj, off, rc) \
4834         (((rc) && ++lookups && vm_page_lookup((obj), (off)) != VM_PAGE_NULL && (rc)--))
4835 #endif  /* MACH_PAGEMAP */
4836
4837                         /*
4838                          * Check the hint location first
4839                          * (since it is often the quickest way out of here).
4840                          */
4841                         if (object->cow_hint != ~(vm_offset_t)0)
4842                                 hint_offset = (vm_object_offset_t)object->cow_hint;
4843                         else
4844                                 hint_offset = (hint_offset > 8 * PAGE_SIZE_64) ?
4845                                               (hint_offset - 8 * PAGE_SIZE_64) : 0;
4846
4847                         if (EXISTS_IN_OBJECT(backing_object, hint_offset +
4848                                              backing_offset, backing_rcount) &&
4849                             !EXISTS_IN_OBJECT(object, hint_offset, rcount)) {
4850                                 /* dependency right at the hint */
4851                                 object->cow_hint = (vm_offset_t) hint_offset; /* atomic */
4852                                 /* try and collapse the rest of the shadow chain */
4853                                 if (object != original_object) {
4854                                         vm_object_unlock(object);
4855                                 }
4856                                 object = backing_object;
4857                                 object_lock_type = backing_object_lock_type;
4858                                 continue;
4859                         }
4860
4861                         /*
4862                          * If the object's window onto the backing_object
4863                          * is large compared to the number of resident
4864                          * pages in the backing object, it makes sense to
4865                          * walk the backing_object's resident pages first.
4866                          *
4867                          * NOTE: Pages may be in both the existence map and
4868                          * resident.  So, we can't permanently decrement
4869                          * the rcount here because the second loop may
4870                          * find the same pages in the backing object's
4871                          * existence map that we found here and we would
4872                          * double-decrement the rcount.  This walk therefore
4873                          * works on a scratch copy (rc) of rcount.
4874                          */
4875                         if (backing_rcount &&
4876 #if     MACH_PAGEMAP
4877                             size > ((backing_object->existence_map) ?
4878                              backing_rcount : (backing_rcount >> 1))
4879 #else
4880                             size > (backing_rcount >> 1)
4881 #endif  /* MACH_PAGEMAP */
4882                                 ) {
4883                                 unsigned int rc = rcount;
4884                                 vm_page_t p;
4885
4886                                 backing_rcount = backing_object->resident_page_count;
4887                                 p = (vm_page_t)queue_first(&backing_object->memq);
4888                                 do {
4889                                         /* Pause every 256 lookups, until we get more than one lookup lock */
4890                                         if (lookups > 256) {
4891                                                 vm_object_collapse_delays++;
4892                                                 lookups = 0;
4893                                                 mutex_pause(0);
4894                                         }
4895
4896                                         offset = (p->offset - backing_offset);
4897                                         if (offset < object->size &&
4898                                             offset != hint_offset &&
4899                                             !EXISTS_IN_OBJECT(object, offset, rc)) {
4900                                                 /* found a dependency */
4901                                                 object->cow_hint = (vm_offset_t) offset; /* atomic */
4902
4903                                                 break;
4904                                         }
4905                                         p = (vm_page_t) queue_next(&p->listq);
4906
4907                                 } while (--backing_rcount);
4908                                 if (backing_rcount != 0 ) {
4909                                         /* try and collapse the rest of the shadow chain */
4910                                         if (object != original_object) {
4911                                                 vm_object_unlock(object);
4912                                         }
4913                                         object = backing_object;
4914                                         object_lock_type = backing_object_lock_type;
4915                                         continue;
4916                                 }
4917                         }
4918
4919                         /*
4920                          * Walk through the offsets looking for pages in the
4921                          * backing object that show through to the object.
4922                          */
4923                         if (backing_rcount
4924 #if MACH_PAGEMAP
4925                             || backing_object->existence_map
4926 #endif  /* MACH_PAGEMAP */
4927                                 ) {
4928                                 offset = hint_offset;
4929
4930                                 while((offset =
4931                                       (offset + PAGE_SIZE_64 < object->size) ?
4932                                       (offset + PAGE_SIZE_64) : 0) != hint_offset) {
4933
4934                                         /* Pause every 256 lookups, until we get more than one lookup lock */
4935                                         if (lookups > 256) {
4936                                                 vm_object_collapse_delays++;
4937                                                 lookups = 0;
4938                                                 mutex_pause(0);
4939                                         }
4940
4941                                         if (EXISTS_IN_OBJECT(backing_object, offset +
4942                                             backing_offset, backing_rcount) &&
4943                                             !EXISTS_IN_OBJECT(object, offset, rcount)) {
4944                                                 /* found a dependency */
4945                                                 object->cow_hint = (vm_offset_t) offset; /* atomic */
4946                                                 break;
4947                                         }
4948                                 }
4949                                 if (offset != hint_offset) {
4950                                         /* try and collapse the rest of the shadow chain */
4951                                         if (object != original_object) {
4952                                                 vm_object_unlock(object);
4953                                         }
4954                                         object = backing_object;
4955                                         object_lock_type = backing_object_lock_type;
4956                                         continue;
4957                                 }
4958                         }
4959                 }
4960
4961                 /*
4962                  * We need "exclusive" locks on the 2 VM objects.
4963                  */
4964                 if (backing_object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
4965                         vm_object_unlock(backing_object);
4966                         if (object != original_object)
4967                                 vm_object_unlock(object);
4968                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4969                         backing_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4970                         goto retry;
4971                 }
4972
4973                 /* reset the offset hint for any objects deeper in the chain */
4974                 object->cow_hint = (vm_offset_t)0;
4975
4976                 /*
4977                  *      All interesting pages in the backing object
4978                  *      already live in the parent or its pager.
4979                  *      Thus we can bypass the backing object.
4980                  */
4981
4982                 vm_object_do_bypass(object, backing_object);
4983                 vm_object_collapse_do_bypass++;
4984
4985                 /*
4986                  *      Try again with this object's new backing object.
4987                  */
4988
4989                 continue;
4990         }
4991         /* NOTE(review): the loop above has no "break" of its own, so this looks unreachable. */
4992         if (object != original_object) {
4993                 vm_object_unlock(object);
4994         }
4995 }
4996
4997 /*
4998  *      Routine:        vm_object_page_remove: [internal]
4999  *      Purpose:
5000  *              Disconnects (when mapped) and releases, via VM_PAGE_FREE,
5001  *              every resident page of the object in [start, end).
5002  *
5003  *      In/out conditions:
5004  *              The object must be locked.
5005  *              The object must not have paging_in_progress, usually
5006  *              guaranteed by not having a pager.
5007  */
5008 unsigned int vm_object_page_remove_lookup = 0;
5009 unsigned int vm_object_page_remove_iterate = 0;
5010
5011 __private_extern__ void
5012 vm_object_page_remove(
5013         register vm_object_t            object,
5014         register vm_object_offset_t     start,
5015         register vm_object_offset_t     end)
5016 {
5017         register vm_page_t      p, successor;
5018
5019         /*
5020          *      Short ranges (one- and two-page removals dominate) use one
5021          *      vm_page_lookup per page; otherwise the resident list is
5022          *      scanned.  The 1/16 cut-over is a somewhat arbitrary balance.
5023          */
5024         if (atop_64(end - start) < (unsigned)object->resident_page_count/16) {
5025                 vm_object_page_remove_lookup++;
5026                 while (start < end) {
5027                         p = vm_page_lookup(object, start);
5028                         start += PAGE_SIZE_64;
5029                         if (p == VM_PAGE_NULL)
5030                                 continue;
5031                         assert(!p->cleaning && !p->pageout);
5032                         if (!p->fictitious && p->pmapped)
5033                                 pmap_disconnect(p->phys_page);
5034                         VM_PAGE_FREE(p);
5035                 }
5036                 return;
5037         }
5038
5039         vm_object_page_remove_iterate++;
5040         for (p = (vm_page_t) queue_first(&object->memq);
5041              !queue_end(&object->memq, (queue_entry_t) p);
5042              p = successor) {
5043                 /* fetch the successor before p is handed to VM_PAGE_FREE */
5044                 successor = (vm_page_t) queue_next(&p->listq);
5045                 if ((p->offset < start) || (end <= p->offset))
5046                         continue;
5047                 assert(!p->cleaning && !p->pageout);
5048                 if (!p->fictitious && p->pmapped)
5049                         pmap_disconnect(p->phys_page);
5050                 VM_PAGE_FREE(p);
5051         }
5052 }
5053
5054
5055 /*
5056  *      Routine:        vm_object_coalesce
5057  *      Function:       Grows prev_object so that it also backs the
5058  *                      adjoining region that follows it in memory.
5059  *
5060  *      Returns TRUE if the regions were combined (trivially so when
5061  *      both objects are null), FALSE otherwise.
5062  *      NOTE:   Only works at the moment if the second object is NULL -
5063  *              if it's not, which object do we lock first?
5064  *
5065  *      Parameters:
5066  *              prev_object     First object to coalesce
5067  *              next_object     Second object to coalesce
5068  *              prev_offset     Offset into prev_object
5069  *              next_offset     Offset into next_object (not used)
5070  *
5071  *              prev_size       Size of reference to prev_object
5072  *              next_size       Size of reference to next_object
5073  *
5074  *      Conditions:
5075  *      The object(s) must *not* be locked. The map must be locked
5076  *      to preserve the reference to the object(s).
5077  */
5078 static int vm_object_coalesce_count = 0;
5079
5080 __private_extern__ boolean_t
5081 vm_object_coalesce(
5082         register vm_object_t            prev_object,
5083         vm_object_t                     next_object,
5084         vm_object_offset_t              prev_offset,
5085         __unused vm_object_offset_t next_offset,
5086         vm_object_size_t                prev_size,
5087         vm_object_size_t                next_size)
5088 {
5089         vm_object_size_t        extended_size;
5090         boolean_t               in_use;
5091
5092 #ifdef  lint
5093         next_offset++;
5094 #endif  /* lint */
5095
5096         /* Merging into an existing second object is not supported. */
5097         if (next_object != VM_OBJECT_NULL)
5098                 return FALSE;
5099
5100         /*
5101          *      No first object either: there is nothing to grow,
5102          *      so report success.
5103          */
5104         if (prev_object == VM_OBJECT_NULL)
5105                 return TRUE;
5106
5107         XPR(XPR_VM_OBJECT,
5108        "vm_object_coalesce: 0x%X prev_off 0x%X prev_size 0x%X next_size 0x%X\n",
5109                 prev_object, prev_offset, prev_size, next_size, 0);
5110
5111         vm_object_lock(prev_object);
5112
5113         /*
5114          *      Let the shadow chain collapse (or be bypassed)
5115          *      before the object is examined.
5116          */
5117         vm_object_collapse(prev_object, prev_offset, TRUE);
5118
5119         /*
5120          *      Can't coalesce if pages not mapped to
5121          *      prev_entry may be in use any way:
5122          *      . more than one reference
5123          *      . paged out
5124          *      . shadows another object
5125          *      . has a copy elsewhere
5126          *      . is marked true_share
5127          *      . is purgeable
5128          *      . paging references (pages might be in page-list)
5129          */
5130         in_use = (prev_object->ref_count > 1) ||
5131                  prev_object->pager_created ||
5132                  (prev_object->shadow != VM_OBJECT_NULL) ||
5133                  (prev_object->copy != VM_OBJECT_NULL) ||
5134                  (prev_object->true_share != FALSE) ||
5135                  (prev_object->purgable != VM_PURGABLE_DENY) ||
5136                  (prev_object->paging_in_progress != 0) ||
5137                  (prev_object->activity_in_progress != 0);
5138         if (in_use) {
5139                 vm_object_unlock(prev_object);
5140                 return FALSE;
5141         }
5142
5143         vm_object_coalesce_count++;
5144
5145         /*
5146          *      A previous deallocation may have left pages in the
5147          *      range being added; get rid of them.
5148          */
5149         vm_object_page_remove(prev_object,
5150                 prev_offset + prev_size,
5151                 prev_offset + prev_size + next_size);
5152
5153         /*
5154          *      Grow the object if the combined range ends past
5155          *      its current size.
5156          */
5157         extended_size = prev_offset + prev_size + next_size;
5158         if (prev_object->size < extended_size) {
5159 #if     MACH_PAGEMAP
5160                 /* An object with existence info must not be extended; this one
5161                  * has no pager (checked above), so none is expected here. */
5162                 assert(prev_object->existence_map == VM_EXTERNAL_NULL);
5163 #endif  /* MACH_PAGEMAP */
5164                 prev_object->size = extended_size;
5165         }
5166
5167         vm_object_unlock(prev_object);
5168         return TRUE;
5169 }
5170
5171 /*
5172  *      Attach a set of physical pages to an object, so that they can
5173  *      be mapped by mapping the object.  Typically used to map IO memory.
5174  *
5175  *      The mapping function and its private data are used to obtain the
5176  *      physical addresses for each page to be mapped.
5177  */
5178 void
5179 vm_object_page_map(
5180         vm_object_t             object,
5181         vm_object_offset_t      offset,
5182         vm_object_size_t        size,
5183         vm_object_offset_t      (*map_fn)(void *map_fn_data,
5184                 vm_object_offset_t offset),
5185                 void            *map_fn_data)   /* private to map_fn */
5186 {
5187         int64_t num_pages;
5188         int     i;
5189         vm_page_t       m;
5190         vm_page_t       old_page;
5191         vm_object_offset_t      addr;
5192
5193         num_pages = atop_64(size);
5194
5195         for (i = 0; i < num_pages; i++, offset += PAGE_SIZE_64) {
5196
5197             addr = (*map_fn)(map_fn_data, offset);
5198
5199             while ((m = vm_page_grab_fictitious()) == VM_PAGE_NULL)
5200                 vm_page_more_fictitious();
5201
5202             vm_object_lock(object);
5203             if ((old_page = vm_page_lookup(object, offset))
5204                         != VM_PAGE_NULL)
5205             {
5206                     VM_PAGE_FREE(old_page);
5207             }
5208
5209             assert((ppnum_t) addr == addr);
5210             vm_page_init(m, (ppnum_t) addr, FALSE);
5211             /*
5212              * private normally requires lock_queues but since we
5213              * are initializing the page, its not necessary here
5214              */
5215             m->private = TRUE;          /* don`t free page */
5216             m->wire_count = 1;
5217             vm_page_insert(m, object, offset);
5218
5219             PAGE_WAKEUP_DONE(m);
5220             vm_object_unlock(object);
5221         }
5222 }
5223
5224 #include <mach_kdb.h>
5225
5226 #if     MACH_KDB
5227 #include <ddb/db_output.h>
5228 #include <vm/vm_print.h>
5229
5230 #define printf  kdbprintf
5231
5232 extern boolean_t        vm_object_cached(
5233                                 vm_object_t object);
5234
5235 extern void             print_bitstring(
5236                                 char byte);
5237
5238 boolean_t       vm_object_print_pages = FALSE;
5239
5240 void
5241 print_bitstring(
5242         char byte)
5243 {
5244         printf("%c%c%c%c%c%c%c%c",
5245                ((byte & (1 << 0)) ? '1' : '0'),
5246                ((byte & (1 << 1)) ? '1' : '0'),
5247                ((byte & (1 << 2)) ? '1' : '0'),
5248                ((byte & (1 << 3)) ? '1' : '0'),
5249                ((byte & (1 << 4)) ? '1' : '0'),
5250                ((byte & (1 << 5)) ? '1' : '0'),
5251                ((byte & (1 << 6)) ? '1' : '0'),
5252                ((byte & (1 << 7)) ? '1' : '0'));
5253 }
5254
5255 boolean_t
5256 vm_object_cached(
5257         __unused register vm_object_t object)
5258 {
5259 #if VM_OBJECT_CACHE
5260         register vm_object_t o;
5261
5262         queue_iterate(&vm_object_cached_list, o, vm_object_t, cached_list) {
5263                 if (object == o) {
5264                         return TRUE;
5265                 }
5266         }
5267 #endif
5268         return FALSE;
5269 }
5270
5271 #if     MACH_PAGEMAP
5272 /*
5273  *      vm_external_print:      [ debug ]
5274  */
5275 void
5276 vm_external_print(
5277         vm_external_map_t       emap,
5278         vm_object_size_t        size)
5279 {
5280         if (emap == VM_EXTERNAL_NULL) {
5281                 printf("0  ");
5282         } else {
5283                 vm_object_size_t existence_size = stob(size);
5284                 printf("{ size=%lld, map=[", (uint64_t) existence_size);
5285                 if (existence_size > 0) {
5286                         print_bitstring(emap[0]);
5287                 }
5288                 if (existence_size > 1) {
5289                         print_bitstring(emap[1]);
5290                 }
5291                 if (existence_size > 2) {
5292                         printf("...");
5293                         print_bitstring(emap[existence_size-1]);
5294                 }
5295                 printf("] }\n");
5296         }
5297         return;
5298 }
5299 #endif  /* MACH_PAGEMAP */
5300
5301 int
5302 vm_follow_object(
5303         vm_object_t object)
5304 {
5305         int count = 0;
5306         int orig_db_indent = db_indent;
5307
5308         while (TRUE) {
5309                 if (object == VM_OBJECT_NULL) {
5310                         db_indent = orig_db_indent;
5311                         return count;
5312                 }
5313
5314                 count += 1;
5315
5316                 iprintf("object 0x%x", object);
5317                 printf(", shadow=0x%x", object->shadow);
5318                 printf(", copy=0x%x", object->copy);
5319                 printf(", pager=0x%x", object->pager);
5320                 printf(", ref=%d\n", object->ref_count);
5321
5322                 db_indent += 2;
5323                 object = object->shadow;
5324         }
5325
5326 }
5327
5328 /*
5329  *      vm_object_print:        [ debug ]
5330  */
5331 void
5332 vm_object_print(db_expr_t db_addr, __unused boolean_t have_addr,
5333                 __unused db_expr_t arg_count, __unused char *modif)
5334 {
5335         vm_object_t     object;
5336         register vm_page_t p;
5337         const char *s;
5338
5339         register int count;
5340
5341         object = (vm_object_t) (long) db_addr;
5342         if (object == VM_OBJECT_NULL)
5343                 return;
5344
5345         iprintf("object 0x%x\n", object);
5346
5347         db_indent += 2;
5348
5349         iprintf("size=0x%x", object->size);
5350         printf(", memq_hint=%p", object->memq_hint);
5351         printf(", ref_count=%d\n", object->ref_count);
5352         iprintf("");
5353 #if     TASK_SWAPPER
5354         printf("res_count=%d, ", object->res_count);
5355 #endif  /* TASK_SWAPPER */
5356         printf("resident_page_count=%d\n", object->resident_page_count);
5357
5358         iprintf("shadow=0x%x", object->shadow);
5359         if (object->shadow) {
5360                 register int i = 0;
5361                 vm_object_t shadow = object;
5362                 while((shadow = shadow->shadow))
5363                         i++;
5364                 printf(" (depth %d)", i);
5365         }
5366         printf(", copy=0x%x", object->copy);
5367         printf(", shadow_offset=0x%x", object->shadow_offset);
5368         printf(", last_alloc=0x%x\n", object->last_alloc);
5369
5370         iprintf("pager=0x%x", object->pager);
5371         printf(", paging_offset=0x%x", object->paging_offset);
5372         printf(", pager_control=0x%x\n", object->pager_control);
5373
5374         iprintf("copy_strategy=%d[", object->copy_strategy);
5375         switch (object->copy_strategy) {
5376                 case MEMORY_OBJECT_COPY_NONE:
5377                 printf("copy_none");
5378                 break;
5379
5380                 case MEMORY_OBJECT_COPY_CALL:
5381                 printf("copy_call");
5382                 break;
5383
5384                 case MEMORY_OBJECT_COPY_DELAY:
5385                 printf("copy_delay");
5386                 break;
5387
5388                 case MEMORY_OBJECT_COPY_SYMMETRIC:
5389                 printf("copy_symmetric");
5390                 break;
5391
5392                 case MEMORY_OBJECT_COPY_INVALID:
5393                 printf("copy_invalid");
5394                 break;
5395
5396                 default:
5397                 printf("?");
5398         }
5399         printf("]");
5400
5401         iprintf("all_wanted=0x%x<", object->all_wanted);
5402         s = "";
5403         if (vm_object_wanted(object, VM_OBJECT_EVENT_INITIALIZED)) {
5404                 printf("%sinit", s);
5405                 s = ",";
5406         }
5407         if (vm_object_wanted(object, VM_OBJECT_EVENT_PAGER_READY)) {
5408                 printf("%sready", s);
5409                 s = ",";
5410         }
5411         if (vm_object_wanted(object, VM_OBJECT_EVENT_PAGING_IN_PROGRESS)) {
5412                 printf("%spaging", s);
5413                 s = ",";
5414         }
5415         if (vm_object_wanted(object, VM_OBJECT_EVENT_LOCK_IN_PROGRESS)) {
5416                 printf("%slock", s);
5417                 s = ",";
5418         }
5419         if (vm_object_wanted(object, VM_OBJECT_EVENT_UNCACHING)) {
5420                 printf("%suncaching", s);
5421                 s = ",";
5422         }
5423         if (vm_object_wanted(object, VM_OBJECT_EVENT_COPY_CALL)) {
5424                 printf("%scopy_call", s);
5425                 s = ",";
5426         }
5427         if (vm_object_wanted(object, VM_OBJECT_EVENT_CACHING)) {
5428                 printf("%scaching", s);
5429                 s = ",";
5430         }
5431         printf(">");
5432         printf(", paging_in_progress=%d\n", object->paging_in_progress);
5433         printf(", activity_in_progress=%d\n", object->activity_in_progress);
5434
5435         iprintf("%screated, %sinit, %sready, %spersist, %strusted, %spageout, %s, %s\n",
5436                 (object->pager_created ? "" : "!"),
5437                 (object->pager_initialized ? "" : "!"),
5438                 (object->pager_ready ? "" : "!"),
5439                 (object->can_persist ? "" : "!"),
5440                 (object->pager_trusted ? "" : "!"),
5441                 (object->pageout ? "" : "!"),
5442                 (object->internal ? "internal" : "external"),
5443                 (object->temporary ? "temporary" : "permanent"));
5444         iprintf("%salive, %spurgeable, %spurgeable_volatile, %spurgeable_empty, %sshadowed, %scached, %sprivate\n",
5445                 (object->alive ? "" : "!"),
5446                 ((object->purgable != VM_PURGABLE_DENY) ? "" : "!"),
5447                 ((object->purgable == VM_PURGABLE_VOLATILE) ? "" : "!"),
5448                 ((object->purgable == VM_PURGABLE_EMPTY) ? "" : "!"),
5449                 (object->shadowed ? "" : "!"),
5450                 (vm_object_cached(object) ? "" : "!"),
5451                 (object->private ? "" : "!"));
5452         iprintf("%sadvisory_pageout, %ssilent_overwrite\n",
5453                 (object->advisory_pageout ? "" : "!"),
5454                 (object->silent_overwrite ? "" : "!"));
5455
5456 #if     MACH_PAGEMAP
5457         iprintf("existence_map=");
5458         vm_external_print(object->existence_map, object->size);
5459 #endif  /* MACH_PAGEMAP */
5460 #if     MACH_ASSERT
5461         iprintf("paging_object=0x%x\n", object->paging_object);
5462 #endif  /* MACH_ASSERT */
5463
5464         if (vm_object_print_pages) {
5465                 count = 0;
5466                 p = (vm_page_t) queue_first(&object->memq);
5467                 while (!queue_end(&object->memq, (queue_entry_t) p)) {
5468                         if (count == 0) {
5469                                 iprintf("memory:=");
5470                         } else if (count == 2) {
5471                                 printf("\n");
5472                                 iprintf(" ...");
5473                                 count = 0;
5474                         } else {
5475                                 printf(",");
5476                         }
5477                         count++;
5478
5479                         printf("(off=0x%llX,page=%p)", p->offset, p);
5480                         p = (vm_page_t) queue_next(&p->listq);
5481                 }
5482                 if (count != 0) {
5483                         printf("\n");
5484                 }
5485         }
5486         db_indent -= 2;
5487 }
5488
5489
5490 /*
5491  *      vm_object_find          [ debug ]
5492  *
5493  *      Find all tasks which reference the given vm_object.
5494  */
5495
5496 boolean_t vm_object_find(vm_object_t object);
5497 boolean_t vm_object_print_verbose = FALSE;
5498
5499 boolean_t
5500 vm_object_find(
5501         vm_object_t     object)
5502 {
5503         task_t task;
5504         vm_map_t map;
5505         vm_map_entry_t entry;
5506         boolean_t found = FALSE;
5507
5508         queue_iterate(&tasks, task, task_t, tasks) {
5509                 map = task->map;
5510                 for (entry = vm_map_first_entry(map);
5511                          entry && entry != vm_map_to_entry(map);
5512                          entry = entry->vme_next) {
5513
5514                         vm_object_t obj;
5515
5516                         /*
5517                          * For the time being skip submaps,
5518                          * only the kernel can have submaps,
5519                          * and unless we are interested in
5520                          * kernel objects, we can simply skip
5521                          * submaps. See sb/dejan/nmk18b7/src/mach_kernel/vm
5522                          * for a full solution.
5523                          */
5524                         if (entry->is_sub_map)
5525                                 continue;
5526                         if (entry)
5527                                 obj = entry->object.vm_object;
5528                         else
5529                                 continue;
5530
5531                         while (obj != VM_OBJECT_NULL) {
5532                                 if (obj == object) {
5533                                         if (!found) {
5534                                                 printf("TASK\t\tMAP\t\tENTRY\n");
5535                                                 found = TRUE;
5536                                         }
5537                                         printf("0x%x\t0x%x\t0x%x\n",
5538                                                    task, map, entry);
5539                                 }
5540                                 obj = obj->shadow;
5541                         }
5542                 }
5543         }
5544
5545         return(found);
5546 }
5547
5548 #endif  /* MACH_KDB */
5549
5550 kern_return_t
5551 vm_object_populate_with_private(
5552                 vm_object_t             object,
5553                 vm_object_offset_t      offset,
5554                 ppnum_t                 phys_page,
5555                 vm_size_t               size)
5556 {
5557         ppnum_t                 base_page;
5558         vm_object_offset_t      base_offset;
5559
5560
5561         if(!object->private)
5562                 return KERN_FAILURE;
5563
5564         base_page = phys_page;
5565
5566         vm_object_lock(object);
5567         if(!object->phys_contiguous) {
5568                 vm_page_t       m;
5569                 if((base_offset = trunc_page_64(offset)) != offset) {
5570                         vm_object_unlock(object);
5571                         return KERN_FAILURE;
5572                 }
5573                 base_offset += object->paging_offset;
5574                 while(size) {
5575                         m = vm_page_lookup(object, base_offset);
5576                         if(m != VM_PAGE_NULL) {
5577                                 if(m->fictitious) {
5578                                         if (m->phys_page != vm_page_guard_addr) {
5579
5580                                                 vm_page_lockspin_queues();
5581                                                 m->private = TRUE;
5582                                                 vm_page_unlock_queues();
5583
5584                                                 m->fictitious = FALSE;
5585                                                 m->phys_page = base_page;
5586                                                 if(!m->busy) {
5587                                                         m->busy = TRUE;
5588                                                 }
5589                                                 if(!m->absent) {
5590                                                         m->absent = TRUE;
5591                                                 }
5592                                                 m->list_req_pending = TRUE;
5593                                         }
5594                                 } else if (m->phys_page != base_page) {
5595                                         if (m->pmapped) {
5596                                                 /*
5597                                                  * pmap call to clear old mapping
5598                                                  */
5599                                                 pmap_disconnect(m->phys_page);
5600                                         }
5601                                         m->phys_page = base_page;
5602                                 }
5603
5604                                 /*
5605                                  * ENCRYPTED SWAP:
5606                                  * We're not pointing to the same
5607                                  * physical page any longer and the
5608                                  * contents of the new one are not
5609                                  * supposed to be encrypted.
5610                                  * XXX What happens to the original
5611                                  * physical page. Is it lost ?
5612                                  */
5613                                 m->encrypted = FALSE;
5614
5615                         } else {
5616                                 while ((m = vm_page_grab_fictitious()) == VM_PAGE_NULL)
5617                                         vm_page_more_fictitious();
5618
5619                                 /*
5620                                  * private normally requires lock_queues but since we
5621                                  * are initializing the page, its not necessary here
5622                                  */
5623                                 m->private = TRUE;
5624                                 m->fictitious = FALSE;
5625                                 m->phys_page = base_page;
5626                                 m->list_req_pending = TRUE;
5627                                 m->absent = TRUE;
5628                                 m->unusual = TRUE;
5629
5630                                 vm_page_insert(m, object, base_offset);
5631                         }
5632                         base_page++;                                                                    /* Go to the next physical page */
5633                         base_offset += PAGE_SIZE;
5634                         size -= PAGE_SIZE;
5635                 }
5636         } else {
5637                 /* NOTE: we should check the original settings here */
5638                 /* if we have a size > zero a pmap call should be made */
5639                 /* to disable the range */
5640
5641                 /* pmap_? */
5642
5643                 /* shadows on contiguous memory are not allowed */
5644                 /* we therefore can use the offset field */
5645                 object->shadow_offset = (vm_object_offset_t)phys_page << PAGE_SHIFT;
5646                 object->size = size;
5647         }
5648         vm_object_unlock(object);
5649         return KERN_SUCCESS;
5650 }
5651
5652 /*
5653  *      memory_object_free_from_cache:
5654  *
5655  *      Walk the vm_object cache list, removing and freeing vm_objects
5656  *      which are backed by the pager identified by the caller, (pager_ops).
5657  *      Remove up to "count" objects, if there are that may available
5658  *      in the cache.
5659  *
5660  *      Walk the list at most once, return the number of vm_objects
5661  *      actually freed.
5662  */
5663
5664 __private_extern__ kern_return_t
5665 memory_object_free_from_cache(
5666         __unused host_t         host,
5667         __unused memory_object_pager_ops_t pager_ops,
5668         int             *count)
5669 {
5670 #if VM_OBJECT_CACHE
5671         int     object_released = 0;
5672
5673         register vm_object_t object = VM_OBJECT_NULL;
5674         vm_object_t shadow;
5675
5676 /*
5677         if(host == HOST_NULL)
5678                 return(KERN_INVALID_ARGUMENT);
5679 */
5680
5681  try_again:
5682         vm_object_cache_lock();
5683
5684         queue_iterate(&vm_object_cached_list, object,
5685                                         vm_object_t, cached_list) {
5686                 if (object->pager &&
5687                     (pager_ops == object->pager->mo_pager_ops)) {
5688                         vm_object_lock(object);
5689                         queue_remove(&vm_object_cached_list, object,
5690                                         vm_object_t, cached_list);
5691                         vm_object_cached_count--;
5692
5693                         vm_object_cache_unlock();
5694                         /*
5695                         *       Since this object is in the cache, we know
5696                         *       that it is initialized and has only a pager's
5697                         *       (implicit) reference. Take a reference to avoid
5698                         *       recursive deallocations.
5699                         */
5700
5701                         assert(object->pager_initialized);
5702                         assert(object->ref_count == 0);
5703                         vm_object_lock_assert_exclusive(object);
5704                         object->ref_count++;
5705
5706                         /*
5707                         *       Terminate the object.
5708                         *       If the object had a shadow, we let
5709                         *       vm_object_deallocate deallocate it.
5710                         *       "pageout" objects have a shadow, but
5711                         *       maintain a "paging reference" rather
5712                         *       than a normal reference.
5713                         *       (We are careful here to limit recursion.)
5714                         */
5715                         shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
5716
5717                         if ((vm_object_terminate(object) == KERN_SUCCESS)
5718                                         && (shadow != VM_OBJECT_NULL)) {
5719                                 vm_object_deallocate(shadow);
5720                         }
5721
5722                         if(object_released++ == *count)
5723                                 return KERN_SUCCESS;
5724                         goto try_again;
5725                 }
5726         }
5727         vm_object_cache_unlock();
5728         *count  = object_released;
5729 #else
5730         *count = 0;
5731 #endif
5732         return KERN_SUCCESS;
5733 }
5734
5735
5736
5737 kern_return_t
5738 memory_object_create_named(
5739         memory_object_t pager,
5740         memory_object_offset_t  size,
5741         memory_object_control_t         *control)
5742 {
5743         vm_object_t             object;
5744         vm_object_hash_entry_t  entry;
5745         lck_mtx_t               *lck;
5746
5747         *control = MEMORY_OBJECT_CONTROL_NULL;
5748         if (pager == MEMORY_OBJECT_NULL)
5749                 return KERN_INVALID_ARGUMENT;
5750
5751         lck = vm_object_hash_lock_spin(pager);
5752         entry = vm_object_hash_lookup(pager, FALSE);
5753
5754         if ((entry != VM_OBJECT_HASH_ENTRY_NULL) &&
5755                         (entry->object != VM_OBJECT_NULL)) {
5756                 if (entry->object->named == TRUE)
5757                         panic("memory_object_create_named: caller already holds the right");    }
5758         vm_object_hash_unlock(lck);
5759
5760         if ((object = vm_object_enter(pager, size, FALSE, FALSE, TRUE)) == VM_OBJECT_NULL) {
5761                 return(KERN_INVALID_OBJECT);
5762         }
5763
5764         /* wait for object (if any) to be ready */
5765         if (object != VM_OBJECT_NULL) {
5766                 vm_object_lock(object);
5767                 object->named = TRUE;
5768                 while (!object->pager_ready) {
5769                         vm_object_sleep(object,
5770                                         VM_OBJECT_EVENT_PAGER_READY,
5771                                         THREAD_UNINT);
5772                 }
5773                 *control = object->pager_control;
5774                 vm_object_unlock(object);
5775         }
5776         return (KERN_SUCCESS);
5777 }
5778
5779
5780 /*
5781  *      Routine:        memory_object_recover_named [user interface]
5782  *      Purpose:
5783  *              Attempt to recover a named reference for a VM object.
5784  *              VM will verify that the object has not already started
5785  *              down the termination path, and if it has, will optionally
5786  *              wait for that to finish.
5787  *      Returns:
5788  *              KERN_SUCCESS - we recovered a named reference on the object
5789  *              KERN_FAILURE - we could not recover a reference (object dead)
5790  *              KERN_INVALID_ARGUMENT - bad memory object control
5791  */
5792 kern_return_t
5793 memory_object_recover_named(
5794         memory_object_control_t control,
5795         boolean_t               wait_on_terminating)
5796 {
5797         vm_object_t             object;
5798
5799         object = memory_object_control_to_vm_object(control);
5800         if (object == VM_OBJECT_NULL) {
5801                 return (KERN_INVALID_ARGUMENT);
5802         }
5803 restart:
5804         vm_object_lock(object);
5805
5806         if (object->terminating && wait_on_terminating) {
5807                 vm_object_wait(object,
5808                         VM_OBJECT_EVENT_PAGING_IN_PROGRESS,
5809                         THREAD_UNINT);
5810                 goto restart;
5811         }
5812
5813         if (!object->alive) {
5814                 vm_object_unlock(object);
5815                 return KERN_FAILURE;
5816         }
5817
5818         if (object->named == TRUE) {
5819                 vm_object_unlock(object);
5820                 return KERN_SUCCESS;
5821         }
5822 #if VM_OBJECT_CACHE
5823         if ((object->ref_count == 0) && (!object->terminating)) {
5824                 if (!vm_object_cache_lock_try()) {
5825                         vm_object_unlock(object);
5826                         goto restart;
5827                 }
5828                 queue_remove(&vm_object_cached_list, object,
5829                                      vm_object_t, cached_list);
5830                 vm_object_cached_count--;
5831                 XPR(XPR_VM_OBJECT_CACHE,
5832                     "memory_object_recover_named: removing %X, head (%X, %X)\n",
5833                     object,
5834                     vm_object_cached_list.next,
5835                     vm_object_cached_list.prev, 0,0);
5836
5837                 vm_object_cache_unlock();
5838         }
5839 #endif
5840         object->named = TRUE;
5841         vm_object_lock_assert_exclusive(object);
5842         object->ref_count++;
5843         vm_object_res_reference(object);
5844         while (!object->pager_ready) {
5845                 vm_object_sleep(object,
5846                                 VM_OBJECT_EVENT_PAGER_READY,
5847                                 THREAD_UNINT);
5848         }
5849         vm_object_unlock(object);
5850         return (KERN_SUCCESS);
5851 }
5852
5853
/*
 *      vm_object_release_name:
 *
 *      Enforces name semantics on a memory_object reference count
 *      decrement.  This routine should not be called unless the caller
 *      holds a name reference gained through memory_object_create_named.
 *
 *      If the TERMINATE_IDLE flag is set, the call will return if the
 *      reference count is not 1, i.e. unless the object is idle and the
 *      only remaining reference is the name.
 *      If the decision is made to proceed, the name field flag is set to
 *      false and the reference count is decremented.  If the RESPECT_CACHE
 *      flag is set and the reference count has gone to zero, the
 *      memory_object is checked to see if it is cacheable; otherwise, when
 *      the reference count is zero, it is simply terminated.
 */
5870
5871 __private_extern__ kern_return_t
5872 vm_object_release_name(
5873         vm_object_t     object,
5874         int             flags)
5875 {
5876         vm_object_t     shadow;
5877         boolean_t       original_object = TRUE;
5878
5879         while (object != VM_OBJECT_NULL) {
5880
5881                 vm_object_lock(object);
5882
5883                 assert(object->alive);
5884                 if (original_object)
5885                         assert(object->named);
5886                 assert(object->ref_count > 0);
5887
5888                 /*
5889                  *      We have to wait for initialization before
5890                  *      destroying or caching the object.
5891                  */
5892
5893                 if (object->pager_created && !object->pager_initialized) {
5894                         assert(!object->can_persist);
5895                         vm_object_assert_wait(object,
5896                                         VM_OBJECT_EVENT_INITIALIZED,
5897                                         THREAD_UNINT);
5898                         vm_object_unlock(object);
5899                         thread_block(THREAD_CONTINUE_NULL);
5900                         continue;
5901                 }
5902
5903                 if (((object->ref_count > 1)
5904                         && (flags & MEMORY_OBJECT_TERMINATE_IDLE))
5905                         || (object->terminating)) {
5906                         vm_object_unlock(object);
5907                         return KERN_FAILURE;
5908                 } else {
5909                         if (flags & MEMORY_OBJECT_RELEASE_NO_OP) {
5910                                 vm_object_unlock(object);
5911                                 return KERN_SUCCESS;
5912                         }
5913                 }
5914
5915                 if ((flags & MEMORY_OBJECT_RESPECT_CACHE) &&
5916                                         (object->ref_count == 1)) {
5917                         if (original_object)
5918                                 object->named = FALSE;
5919                         vm_object_unlock(object);
5920                         /* let vm_object_deallocate push this thing into */
5921                         /* the cache, if that it is where it is bound */
5922                         vm_object_deallocate(object);
5923                         return KERN_SUCCESS;
5924                 }
5925                 VM_OBJ_RES_DECR(object);
5926                 shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
5927
5928                 if (object->ref_count == 1) {
5929                         if (vm_object_terminate(object) != KERN_SUCCESS) {
5930                                 if (original_object) {
5931                                         return KERN_FAILURE;
5932                                 } else {
5933                                         return KERN_SUCCESS;
5934                                 }
5935                         }
5936                         if (shadow != VM_OBJECT_NULL) {
5937                                 original_object = FALSE;
5938                                 object = shadow;
5939                                 continue;
5940                         }
5941                         return KERN_SUCCESS;
5942                 } else {
5943                         vm_object_lock_assert_exclusive(object);
5944                         object->ref_count--;
5945                         assert(object->ref_count > 0);
5946                         if(original_object)
5947                                 object->named = FALSE;
5948                         vm_object_unlock(object);
5949                         return KERN_SUCCESS;
5950                 }
5951         }
5952         /*NOTREACHED*/
5953         assert(0);
5954         return KERN_FAILURE;
5955 }
5956
5957
5958 __private_extern__ kern_return_t
5959 vm_object_lock_request(
5960         vm_object_t                     object,
5961         vm_object_offset_t              offset,
5962         vm_object_size_t                size,
5963         memory_object_return_t          should_return,
5964         int                             flags,
5965         vm_prot_t                       prot)
5966 {
5967         __unused boolean_t      should_flush;
5968
5969         should_flush = flags & MEMORY_OBJECT_DATA_FLUSH;
5970
5971         XPR(XPR_MEMORY_OBJECT,
5972             "vm_o_lock_request, obj 0x%X off 0x%X size 0x%X flags %X prot %X\n",
5973             object, offset, size,
5974             (((should_return&1)<<1)|should_flush), prot);
5975
5976         /*
5977          *      Check for bogus arguments.
5978          */
5979         if (object == VM_OBJECT_NULL)
5980                 return (KERN_INVALID_ARGUMENT);
5981
5982         if ((prot & ~VM_PROT_ALL) != 0 && prot != VM_PROT_NO_CHANGE)
5983                 return (KERN_INVALID_ARGUMENT);
5984
5985         size = round_page_64(size);
5986
5987         /*
5988          *      Lock the object, and acquire a paging reference to
5989          *      prevent the memory_object reference from being released.
5990          */
5991         vm_object_lock(object);
5992         vm_object_paging_begin(object);
5993
5994         (void)vm_object_update(object,
5995                 offset, size, NULL, NULL, should_return, flags, prot);
5996
5997         vm_object_paging_end(object);
5998         vm_object_unlock(object);
5999
6000         return (KERN_SUCCESS);
6001 }
6002
6003 /*
6004  * Empty a purgeable object by grabbing the physical pages assigned to it and
6005  * putting them on the free queue without writing them to backing store, etc.
6006  * When the pages are next touched they will be demand zero-fill pages.  We
6007  * skip pages which are busy, being paged in/out, wired, etc.  We do _not_
6008  * skip referenced/dirty pages, pages on the active queue, etc.  We're more
6009  * than happy to grab these since this is a purgeable object.  We mark the
6010  * object as "empty" after reaping its pages.
6011  *
6012  * On entry the object must be locked and it must be
6013  * purgeable with no delayed copies pending.
6014  */
6015 void
6016 vm_object_purge(vm_object_t object)
6017 {
6018         vm_object_lock_assert_exclusive(object);
6019
6020         if (object->purgable == VM_PURGABLE_DENY)
6021                 return;
6022
6023         assert(object->copy == VM_OBJECT_NULL);
6024         assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6025
6026         if(object->purgable == VM_PURGABLE_VOLATILE) {
6027                 unsigned int delta;
6028                 assert(object->resident_page_count >=
6029                        object->wired_page_count);
6030                 delta = (object->resident_page_count -
6031                          object->wired_page_count);
6032                 if (delta != 0) {
6033                         assert(vm_page_purgeable_count >=
6034                                delta);
6035                         OSAddAtomic(-delta,
6036                                     (SInt32 *)&vm_page_purgeable_count);
6037                 }
6038                 if (object->wired_page_count != 0) {
6039                         assert(vm_page_purgeable_wired_count >=
6040                                object->wired_page_count);
6041                         OSAddAtomic(-object->wired_page_count,
6042                                     (SInt32 *)&vm_page_purgeable_wired_count);
6043                 }
6044         }
6045         object->purgable = VM_PURGABLE_EMPTY;
6046
6047         vm_object_reap_pages(object, REAP_PURGEABLE);
6048 }
6049
6050
6051 /*
 * vm_object_purgable_control() allows the caller to control and investigate the
6053  * state of a purgeable object.  A purgeable object is created via a call to
6054  * vm_allocate() with VM_FLAGS_PURGABLE specified.  A purgeable object will
6055  * never be coalesced with any other object -- even other purgeable objects --
6056  * and will thus always remain a distinct object.  A purgeable object has
6057  * special semantics when its reference count is exactly 1.  If its reference
6058  * count is greater than 1, then a purgeable object will behave like a normal
6059  * object and attempts to use this interface will result in an error return
6060  * of KERN_INVALID_ARGUMENT.
6061  *
6062  * A purgeable object may be put into a "volatile" state which will make the
 * object's pages eligible for being reclaimed without paging to backing
6064  * store if the system runs low on memory.  If the pages in a volatile
6065  * purgeable object are reclaimed, the purgeable object is said to have been
6066  * "emptied."  When a purgeable object is emptied the system will reclaim as
6067  * many pages from the object as it can in a convenient manner (pages already
6068  * en route to backing store or busy for other reasons are left as is).  When
6069  * a purgeable object is made volatile, its pages will generally be reclaimed
6070  * before other pages in the application's working set.  This semantic is
6071  * generally used by applications which can recreate the data in the object
6072  * faster than it can be paged in.  One such example might be media assets
6073  * which can be reread from a much faster RAID volume.
6074  *
6075  * A purgeable object may be designated as "non-volatile" which means it will
6076  * behave like all other objects in the system with pages being written to and
6077  * read from backing store as needed to satisfy system memory needs.  If the
6078  * object was emptied before the object was made non-volatile, that fact will
6079  * be returned as the old state of the purgeable object (see
6080  * VM_PURGABLE_SET_STATE below).  In this case, any pages of the object which
6081  * were reclaimed as part of emptying the object will be refaulted in as
6082  * zero-fill on demand.  It is up to the application to note that an object
6083  * was emptied and recreate the objects contents if necessary.  When a
6084  * purgeable object is made non-volatile, its pages will generally not be paged
6085  * out to backing store in the immediate future.  A purgeable object may also
6086  * be manually emptied.
6087  *
6088  * Finally, the current state (non-volatile, volatile, volatile & empty) of a
6089  * volatile purgeable object may be queried at any time.  This information may
6090  * be used as a control input to let the application know when the system is
6091  * experiencing memory pressure and is reclaiming memory.
6092  *
6093  * The specified address may be any address within the purgeable object.  If
6094  * the specified address does not represent any object in the target task's
6095  * virtual address space, then KERN_INVALID_ADDRESS will be returned.  If the
6096  * object containing the specified address is not a purgeable object, then
6097  * KERN_INVALID_ARGUMENT will be returned.  Otherwise, KERN_SUCCESS will be
6098  * returned.
6099  *
6100  * The control parameter may be any one of VM_PURGABLE_SET_STATE or
6101  * VM_PURGABLE_GET_STATE.  For VM_PURGABLE_SET_STATE, the in/out parameter
6102  * state is used to set the new state of the purgeable object and return its
6103  * old state.  For VM_PURGABLE_GET_STATE, the current state of the purgeable
6104  * object is returned in the parameter state.
6105  *
6106  * The in/out parameter state may be one of VM_PURGABLE_NONVOLATILE,
6107  * VM_PURGABLE_VOLATILE or VM_PURGABLE_EMPTY.  These, respectively, represent
6108  * the non-volatile, volatile and volatile/empty states described above.
6109  * Setting the state of a purgeable object to VM_PURGABLE_EMPTY will
6110  * immediately reclaim as many pages in the object as can be conveniently
6111  * collected (some may have already been written to backing store or be
6112  * otherwise busy).
6113  *
6114  * The process of making a purgeable object non-volatile and determining its
6115  * previous state is atomic.  Thus, if a purgeable object is made
6116  * VM_PURGABLE_NONVOLATILE and the old state is returned as
6117  * VM_PURGABLE_VOLATILE, then the purgeable object's previous contents are
6118  * completely intact and will remain so until the object is made volatile
6119  * again.  If the old state is returned as VM_PURGABLE_EMPTY then the object
6120  * was reclaimed while it was in a volatile state and its previous contents
6121  * have been lost.
6122  */
6123 /*
6124  * The object must be locked.
6125  */
6126 kern_return_t
6127 vm_object_purgable_control(
6128         vm_object_t     object,
6129         vm_purgable_t   control,
6130         int             *state)
6131 {
6132         int             old_state;
6133         int             new_state;
6134
6135         if (object == VM_OBJECT_NULL) {
6136                 /*
6137                  * Object must already be present or it can't be purgeable.
6138                  */
6139                 return KERN_INVALID_ARGUMENT;
6140         }
6141
6142         /*
6143          * Get current state of the purgeable object.
6144          */
6145         old_state = object->purgable;
6146         if (old_state == VM_PURGABLE_DENY)
6147                 return KERN_INVALID_ARGUMENT;
6148
6149         /* purgeable cant have delayed copies - now or in the future */
6150         assert(object->copy == VM_OBJECT_NULL);
6151         assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6152
6153         /*
6154          * Execute the desired operation.
6155          */
6156         if (control == VM_PURGABLE_GET_STATE) {
6157                 *state = old_state;
6158                 return KERN_SUCCESS;
6159         }
6160
6161         if ((*state) & VM_PURGABLE_DEBUG_EMPTY) {
6162                 object->volatile_empty = TRUE;
6163         }
6164         if ((*state) & VM_PURGABLE_DEBUG_FAULT) {
6165                 object->volatile_fault = TRUE;
6166         }
6167
6168         new_state = *state & VM_PURGABLE_STATE_MASK;
6169         if (new_state == VM_PURGABLE_VOLATILE &&
6170             object->volatile_empty) {
6171                 new_state = VM_PURGABLE_EMPTY;
6172         }
6173
6174         switch (new_state) {
6175         case VM_PURGABLE_DENY:
6176         case VM_PURGABLE_NONVOLATILE:
6177                 object->purgable = new_state;
6178
6179                 if (old_state == VM_PURGABLE_VOLATILE) {
6180                         unsigned int delta;
6181
6182                         assert(object->resident_page_count >=
6183                                object->wired_page_count);
6184                         delta = (object->resident_page_count -
6185                                  object->wired_page_count);
6186
6187                         assert(vm_page_purgeable_count >= delta);
6188
6189                         if (delta != 0) {
6190                                 OSAddAtomic(-delta,
6191                                             (SInt32 *)&vm_page_purgeable_count);
6192                         }
6193                         if (object->wired_page_count != 0) {
6194                                 assert(vm_page_purgeable_wired_count >=
6195                                        object->wired_page_count);
6196                                 OSAddAtomic(-object->wired_page_count,
6197                                             (SInt32 *)&vm_page_purgeable_wired_count);
6198                         }
6199
6200                         vm_page_lock_queues();
6201
6202                         assert(object->objq.next != NULL && object->objq.prev != NULL); /* object should be on a queue */
6203                         purgeable_q_t queue = vm_purgeable_object_remove(object);
6204                         assert(queue);
6205
6206                         vm_purgeable_token_delete_first(queue);
6207                         assert(queue->debug_count_objects>=0);
6208
6209                         vm_page_unlock_queues();
6210                 }
6211                 break;
6212
6213         case VM_PURGABLE_VOLATILE:
6214                 if (object->volatile_fault) {
6215                         vm_page_t       p;
6216                         int             refmod;
6217
6218                         queue_iterate(&object->memq, p, vm_page_t, listq) {
6219                                 if (p->busy ||
6220                                     VM_PAGE_WIRED(p) ||
6221                                     p->fictitious) {
6222                                         continue;
6223                                 }
6224                                 refmod = pmap_disconnect(p->phys_page);
6225                                 if ((refmod & VM_MEM_MODIFIED) &&
6226                                     !p->dirty) {
6227                                         p->dirty = TRUE;
6228                                 }
6229                         }
6230                 }
6231
6232                 if (old_state == VM_PURGABLE_EMPTY &&
6233                     object->resident_page_count == 0)
6234                         break;
6235
6236                 purgeable_q_t queue;
6237
6238                 /* find the correct queue */
6239                 if ((*state&VM_PURGABLE_ORDERING_MASK) == VM_PURGABLE_ORDERING_OBSOLETE)
6240                         queue = &purgeable_queues[PURGEABLE_Q_TYPE_OBSOLETE];
6241                 else {
6242                         if ((*state&VM_PURGABLE_BEHAVIOR_MASK) == VM_PURGABLE_BEHAVIOR_FIFO)
6243                                 queue = &purgeable_queues[PURGEABLE_Q_TYPE_FIFO];
6244                         else
6245                                 queue = &purgeable_queues[PURGEABLE_Q_TYPE_LIFO];
6246                 }
6247
6248                 if (old_state == VM_PURGABLE_NONVOLATILE ||
6249                     old_state == VM_PURGABLE_EMPTY) {
6250                         unsigned int delta;
6251
6252                         /* try to add token... this can fail */
6253                         vm_page_lock_queues();
6254
6255                         kern_return_t result = vm_purgeable_token_add(queue);
6256                         if (result != KERN_SUCCESS) {
6257                                 vm_page_unlock_queues();
6258                                 return result;
6259                         }
6260                         vm_page_unlock_queues();
6261
6262                         assert(object->resident_page_count >=
6263                                object->wired_page_count);
6264                         delta = (object->resident_page_count -
6265                                  object->wired_page_count);
6266
6267                         if (delta != 0) {
6268                                 OSAddAtomic(delta,
6269                                             &vm_page_purgeable_count);
6270                         }
6271                         if (object->wired_page_count != 0) {
6272                                 OSAddAtomic(object->wired_page_count,
6273                                             &vm_page_purgeable_wired_count);
6274                         }
6275
6276                         object->purgable = new_state;
6277
6278                         /* object should not be on a queue */
6279                         assert(object->objq.next == NULL && object->objq.prev == NULL);
6280                 }
6281                 else if (old_state == VM_PURGABLE_VOLATILE) {
6282                         /*
6283                          * if reassigning priorities / purgeable groups, we don't change the
6284                          * token queue. So moving priorities will not make pages stay around longer.
6285                          * Reasoning is that the algorithm gives most priority to the most important
6286                          * object. If a new token is added, the most important object' priority is boosted.
6287                          * This biases the system already for purgeable queues that move a lot.
6288                          * It doesn't seem more biasing is neccessary in this case, where no new object is added.
6289                          */
6290                         assert(object->objq.next != NULL && object->objq.prev != NULL); /* object should be on a queue */
6291
6292                         purgeable_q_t old_queue=vm_purgeable_object_remove(object);
6293                         assert(old_queue);
6294
6295                         if (old_queue != queue) {
6296                                 kern_return_t result;
6297
6298                                 /* Changing queue. Have to move token. */
6299                                 vm_page_lock_queues();
6300                                 vm_purgeable_token_delete_first(old_queue);
6301                                 result = vm_purgeable_token_add(queue);
6302                                 vm_page_unlock_queues();
6303
6304                                 assert(result==KERN_SUCCESS);   /* this should never fail since we just freed a token */
6305                         }
6306                 };
6307                 vm_purgeable_object_add(object, queue, (*state&VM_VOLATILE_GROUP_MASK)>>VM_VOLATILE_GROUP_SHIFT );
6308
6309                 assert(queue->debug_count_objects>=0);
6310
6311                 break;
6312
6313
6314         case VM_PURGABLE_EMPTY:
6315                 if (object->volatile_fault) {
6316                         vm_page_t       p;
6317                         int             refmod;
6318
6319                         queue_iterate(&object->memq, p, vm_page_t, listq) {
6320                                 if (p->busy ||
6321                                     VM_PAGE_WIRED(p) ||
6322                                     p->fictitious) {
6323                                         continue;
6324                                 }
6325                                 refmod = pmap_disconnect(p->phys_page);
6326                                 if ((refmod & VM_MEM_MODIFIED) &&
6327                                     !p->dirty) {
6328                                         p->dirty = TRUE;
6329                                 }
6330                         }
6331                 }
6332
6333                 if (old_state != new_state) {
6334                         assert(old_state == VM_PURGABLE_NONVOLATILE ||
6335                                old_state == VM_PURGABLE_VOLATILE);
6336                         if (old_state == VM_PURGABLE_VOLATILE) {
6337                                 purgeable_q_t old_queue;
6338
6339                                 /* object should be on a queue */
6340                                 assert(object->objq.next != NULL &&
6341                                        object->objq.prev != NULL);
6342                                 old_queue = vm_purgeable_object_remove(object);
6343                                 assert(old_queue);
6344                                 vm_page_lock_queues();
6345                                 vm_purgeable_token_delete_first(old_queue);
6346                                 vm_page_unlock_queues();
6347                         }
6348                         (void) vm_object_purge(object);
6349                 }
6350                 break;
6351
6352         }
6353         *state = old_state;
6354
6355         return KERN_SUCCESS;
6356 }
6357
6358 #if     TASK_SWAPPER
6359 /*
6360  * vm_object_res_deallocate
6361  *
6362  * (recursively) decrement residence counts on vm objects and their shadows.
6363  * Called from vm_object_deallocate and when swapping out an object.
6364  *
6365  * The object is locked, and remains locked throughout the function,
6366  * even as we iterate down the shadow chain.  Locks on intermediate objects
6367  * will be dropped, but not the original object.
6368  *
6369  * NOTE: this function used to use recursion, rather than iteration.
6370  */
6371
6372 __private_extern__ void
6373 vm_object_res_deallocate(
6374         vm_object_t     object)
6375 {
6376         vm_object_t orig_object = object;
6377         /*
6378          * Object is locked so it can be called directly
6379          * from vm_object_deallocate.  Original object is never
6380          * unlocked.
6381          */
6382         assert(object->res_count > 0);
6383         while  (--object->res_count == 0) {
6384                 assert(object->ref_count >= object->res_count);
6385                 vm_object_deactivate_all_pages(object);
6386                 /* iterate on shadow, if present */
6387                 if (object->shadow != VM_OBJECT_NULL) {
6388                         vm_object_t tmp_object = object->shadow;
6389                         vm_object_lock(tmp_object);
6390                         if (object != orig_object)
6391                                 vm_object_unlock(object);
6392                         object = tmp_object;
6393                         assert(object->res_count > 0);
6394                 } else
6395                         break;
6396         }
6397         if (object != orig_object)
6398                 vm_object_unlock(object);
6399 }
6400
6401 /*
6402  * vm_object_res_reference
6403  *
6404  * Internal function to increment residence count on a vm object
6405  * and its shadows.  It is called only from vm_object_reference, and
6406  * when swapping in a vm object, via vm_map_swap.
6407  *
6408  * The object is locked, and remains locked throughout the function,
6409  * even as we iterate down the shadow chain.  Locks on intermediate objects
6410  * will be dropped, but not the original object.
6411  *
6412  * NOTE: this function used to use recursion, rather than iteration.
6413  */
6414
6415 __private_extern__ void
6416 vm_object_res_reference(
6417         vm_object_t     object)
6418 {
6419         vm_object_t orig_object = object;
6420         /*
6421          * Object is locked, so this can be called directly
6422          * from vm_object_reference.  This lock is never released.
6423          */
6424         while  ((++object->res_count == 1)  &&
6425                 (object->shadow != VM_OBJECT_NULL)) {
6426                 vm_object_t tmp_object = object->shadow;
6427
6428                 assert(object->ref_count >= object->res_count);
6429                 vm_object_lock(tmp_object);
6430                 if (object != orig_object)
6431                         vm_object_unlock(object);
6432                 object = tmp_object;
6433         }
6434         if (object != orig_object)
6435                 vm_object_unlock(object);
6436         assert(orig_object->ref_count >= orig_object->res_count);
6437 }
6438 #endif  /* TASK_SWAPPER */
6439
6440 /*
6441  *      vm_object_reference:
6442  *
6443  *      Gets another reference to the given object.
6444  */
6445 #ifdef vm_object_reference
6446 #undef vm_object_reference
6447 #endif
6448 __private_extern__ void
6449 vm_object_reference(
6450         register vm_object_t    object)
6451 {
6452         if (object == VM_OBJECT_NULL)
6453                 return;
6454
6455         vm_object_lock(object);
6456         assert(object->ref_count > 0);
6457         vm_object_reference_locked(object);
6458         vm_object_unlock(object);
6459 }
6460
6461 #ifdef MACH_BSD
6462 /*
6463  * Scale the vm_object_cache
6464  * This is required to make sure that the vm_object_cache is big
6465  * enough to effectively cache the mapped file.
 * This is really important with UBC as all the regular file vnodes
 * have a memory object associated with them. Having this cache too
 * small results in rapid reclaim of vnodes and hurts performance a LOT!
6469  *
6470  * This is also needed as number of vnodes can be dynamically scaled.
6471  */
6472 kern_return_t
6473 adjust_vm_object_cache(
6474         __unused vm_size_t oval,
6475         __unused vm_size_t nval)
6476 {
6477 #if VM_OBJECT_CACHE
6478         vm_object_cached_max = nval;
6479         vm_object_cache_trim(FALSE);
6480 #endif
6481         return (KERN_SUCCESS);
6482 }
6483 #endif /* MACH_BSD */
6484
6485
6486 /*
6487  * vm_object_transpose
6488  *
6489  * This routine takes two VM objects of the same size and exchanges
6490  * their backing store.
6491  * The objects should be "quiesced" via a UPL operation with UPL_SET_IO_WIRE
6492  * and UPL_BLOCK_ACCESS if they are referenced anywhere.
6493  *
6494  * The VM objects must not be locked by caller.
6495  */
6496 unsigned int vm_object_transpose_count = 0;
6497 kern_return_t
6498 vm_object_transpose(
6499         vm_object_t             object1,
6500         vm_object_t             object2,
6501         vm_object_size_t        transpose_size)
6502 {
6503         vm_object_t             tmp_object;
6504         kern_return_t           retval;
6505         boolean_t               object1_locked, object2_locked;
6506         vm_page_t               page;
6507         vm_object_offset_t      page_offset;
6508         lck_mtx_t               *hash_lck;
6509         vm_object_hash_entry_t  hash_entry;
6510
6511         tmp_object = VM_OBJECT_NULL;
6512         object1_locked = FALSE; object2_locked = FALSE;
6513
6514         if (object1 == object2 ||
6515             object1 == VM_OBJECT_NULL ||
6516             object2 == VM_OBJECT_NULL) {
6517                 /*
6518                  * If the 2 VM objects are the same, there's
6519                  * no point in exchanging their backing store.
6520                  */
6521                 retval = KERN_INVALID_VALUE;
6522                 goto done;
6523         }
6524
6525         /*
6526          * Since we need to lock both objects at the same time,
6527          * make sure we always lock them in the same order to
6528          * avoid deadlocks.
6529          */
6530         if (object1 >  object2) {
6531                 tmp_object = object1;
6532                 object1 = object2;
6533                 object2 = tmp_object;
6534         }
6535
6536         /*
6537          * Allocate a temporary VM object to hold object1's contents
6538          * while we copy object2 to object1.
6539          */
6540         tmp_object = vm_object_allocate(transpose_size);
6541         vm_object_lock(tmp_object);
6542         tmp_object->can_persist = FALSE;
6543
6544
6545         /*
6546          * Grab control of the 1st VM object.
6547          */
6548         vm_object_lock(object1);
6549         object1_locked = TRUE;
6550         if (!object1->alive || object1->terminating ||
6551             object1->copy || object1->shadow || object1->shadowed ||
6552             object1->purgable != VM_PURGABLE_DENY) {
6553                 /*
6554                  * We don't deal with copy or shadow objects (yet).
6555                  */
6556                 retval = KERN_INVALID_VALUE;
6557                 goto done;
6558         }
6559         /*
6560          * We're about to mess with the object's backing store and
6561          * taking a "paging_in_progress" reference wouldn't be enough
6562          * to prevent any paging activity on this object, so the caller should
6563          * have "quiesced" the objects beforehand, via a UPL operation with
6564          * UPL_SET_IO_WIRE (to make sure all the pages are there and wired)
6565          * and UPL_BLOCK_ACCESS (to mark the pages "busy").
6566          *
6567          * Wait for any paging operation to complete (but only paging, not
6568          * other kind of activities not linked to the pager).  After we're
6569          * statisfied that there's no more paging in progress, we keep the
6570          * object locked, to guarantee that no one tries to access its pager.
6571          */
6572         vm_object_paging_only_wait(object1, THREAD_UNINT);
6573
6574         /*
6575          * Same as above for the 2nd object...
6576          */
6577         vm_object_lock(object2);
6578         object2_locked = TRUE;
6579         if (! object2->alive || object2->terminating ||
6580             object2->copy || object2->shadow || object2->shadowed ||
6581             object2->purgable != VM_PURGABLE_DENY) {
6582                 retval = KERN_INVALID_VALUE;
6583                 goto done;
6584         }
6585         vm_object_paging_only_wait(object2, THREAD_UNINT);
6586
6587
6588         if (object1->size != object2->size ||
6589             object1->size != transpose_size) {
6590                 /*
6591                  * If the 2 objects don't have the same size, we can't
6592                  * exchange their backing stores or one would overflow.
6593                  * If their size doesn't match the caller's
6594                  * "transpose_size", we can't do it either because the
6595                  * transpose operation will affect the entire span of
6596                  * the objects.
6597                  */
6598                 retval = KERN_INVALID_VALUE;
6599                 goto done;
6600         }
6601
6602
6603         /*
6604          * Transpose the lists of resident pages.
6605          * This also updates the resident_page_count and the memq_hint.
6606          */
6607         if (object1->phys_contiguous || queue_empty(&object1->memq)) {
6608                 /*
6609                  * No pages in object1, just transfer pages
6610                  * from object2 to object1.  No need to go through
6611                  * an intermediate object.
6612                  */
6613                 while (!queue_empty(&object2->memq)) {
6614                         page = (vm_page_t) queue_first(&object2->memq);
6615                         vm_page_rename(page, object1, page->offset, FALSE);
6616                 }
6617                 assert(queue_empty(&object2->memq));
6618         } else if (object2->phys_contiguous || queue_empty(&object2->memq)) {
6619                 /*
6620                  * No pages in object2, just transfer pages
6621                  * from object1 to object2.  No need to go through
6622                  * an intermediate object.
6623                  */
6624                 while (!queue_empty(&object1->memq)) {
6625                         page = (vm_page_t) queue_first(&object1->memq);
6626                         vm_page_rename(page, object2, page->offset, FALSE);
6627                 }
6628                 assert(queue_empty(&object1->memq));
6629         } else {
6630                 /* transfer object1's pages to tmp_object */
6631                 while (!queue_empty(&object1->memq)) {
6632                         page = (vm_page_t) queue_first(&object1->memq);
6633                         page_offset = page->offset;
6634                         vm_page_remove(page, TRUE);
6635                         page->offset = page_offset;
6636                         queue_enter(&tmp_object->memq, page, vm_page_t, listq);
6637                 }
6638                 assert(queue_empty(&object1->memq));
6639                 /* transfer object2's pages to object1 */
6640                 while (!queue_empty(&object2->memq)) {
6641                         page = (vm_page_t) queue_first(&object2->memq);
6642                         vm_page_rename(page, object1, page->offset, FALSE);
6643                 }
6644                 assert(queue_empty(&object2->memq));
6645                 /* transfer tmp_object's pages to object2 */
6646                 while (!queue_empty(&tmp_object->memq)) {
6647                         page = (vm_page_t) queue_first(&tmp_object->memq);
6648                         queue_remove(&tmp_object->memq, page,
6649                                      vm_page_t, listq);
6650                         vm_page_insert(page, object2, page->offset);
6651                 }
6652                 assert(queue_empty(&tmp_object->memq));
6653         }
6654
6655 #define __TRANSPOSE_FIELD(field)                                \
6656 MACRO_BEGIN                                                     \
6657         tmp_object->field = object1->field;                     \
6658         object1->field = object2->field;                        \
6659         object2->field = tmp_object->field;                     \
6660 MACRO_END
6661
6662         /* "Lock" refers to the object not its contents */
6663         /* "size" should be identical */
6664         assert(object1->size == object2->size);
6665         /* "memq_hint" was updated above when transposing pages */
6666         /* "ref_count" refers to the object not its contents */
6667 #if TASK_SWAPPER
6668         /* "res_count" refers to the object not its contents */
6669 #endif
6670         /* "resident_page_count" was updated above when transposing pages */
6671         /* "wired_page_count" was updated above when transposing pages */
6672         /* "reusable_page_count" was updated above when transposing pages */
6673         /* there should be no "copy" */
6674         assert(!object1->copy);
6675         assert(!object2->copy);
6676         /* there should be no "shadow" */
6677         assert(!object1->shadow);
6678         assert(!object2->shadow);
6679         __TRANSPOSE_FIELD(shadow_offset); /* used by phys_contiguous objects */
6680         __TRANSPOSE_FIELD(pager);
6681         __TRANSPOSE_FIELD(paging_offset);
6682         __TRANSPOSE_FIELD(pager_control);
6683         /* update the memory_objects' pointers back to the VM objects */
6684         if (object1->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
6685                 memory_object_control_collapse(object1->pager_control,
6686                                                object1);
6687         }
6688         if (object2->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
6689                 memory_object_control_collapse(object2->pager_control,
6690                                                object2);
6691         }
6692         __TRANSPOSE_FIELD(copy_strategy);
6693         /* "paging_in_progress" refers to the object not its contents */
6694         assert(!object1->paging_in_progress);
6695         assert(!object2->paging_in_progress);
6696         assert(object1->activity_in_progress);
6697         assert(object2->activity_in_progress);
6698         /* "all_wanted" refers to the object not its contents */
6699         __TRANSPOSE_FIELD(pager_created);
6700         __TRANSPOSE_FIELD(pager_initialized);
6701         __TRANSPOSE_FIELD(pager_ready);
6702         __TRANSPOSE_FIELD(pager_trusted);
6703         __TRANSPOSE_FIELD(can_persist);
6704         __TRANSPOSE_FIELD(internal);
6705         __TRANSPOSE_FIELD(temporary);
6706         __TRANSPOSE_FIELD(private);
6707         __TRANSPOSE_FIELD(pageout);
6708         /* "alive" should be set */
6709         assert(object1->alive);
6710         assert(object2->alive);
6711         /* "purgeable" should be non-purgeable */
6712         assert(object1->purgable == VM_PURGABLE_DENY);
6713         assert(object2->purgable == VM_PURGABLE_DENY);
6714         /* "shadowed" refers to the object not its contents */
6715         __TRANSPOSE_FIELD(silent_overwrite);
6716         __TRANSPOSE_FIELD(advisory_pageout);
6717         __TRANSPOSE_FIELD(true_share);
6718         /* "terminating" should not be set */
6719         assert(!object1->terminating);
6720         assert(!object2->terminating);
6721         __TRANSPOSE_FIELD(named);
6722         /* "shadow_severed" refers to the object not its contents */
6723         __TRANSPOSE_FIELD(phys_contiguous);
6724         __TRANSPOSE_FIELD(nophyscache);
6725         /* "cached_list.next" points to transposed object */
6726         object1->cached_list.next = (queue_entry_t) object2;
6727         object2->cached_list.next = (queue_entry_t) object1;
6728         /* "cached_list.prev" should be NULL */
6729         assert(object1->cached_list.prev == NULL);
6730         assert(object2->cached_list.prev == NULL);
6731         /* "msr_q" is linked to the object not its contents */
6732         assert(queue_empty(&object1->msr_q));
6733         assert(queue_empty(&object2->msr_q));
6734         __TRANSPOSE_FIELD(last_alloc);
6735         __TRANSPOSE_FIELD(sequential);
6736         __TRANSPOSE_FIELD(pages_created);
6737         __TRANSPOSE_FIELD(pages_used);
6738 #if MACH_PAGEMAP
6739         __TRANSPOSE_FIELD(existence_map);
6740 #endif
6741         __TRANSPOSE_FIELD(cow_hint);
6742 #if MACH_ASSERT
6743         __TRANSPOSE_FIELD(paging_object);
6744 #endif
6745         __TRANSPOSE_FIELD(wimg_bits);
6746         __TRANSPOSE_FIELD(code_signed);
6747         if (object1->hashed) {
6748                 hash_lck = vm_object_hash_lock_spin(object2->pager);
6749                 hash_entry = vm_object_hash_lookup(object2->pager, FALSE);
6750                 assert(hash_entry != VM_OBJECT_HASH_ENTRY_NULL);
6751                 hash_entry->object = object2;
6752                 vm_object_hash_unlock(hash_lck);
6753         }
6754         if (object2->hashed) {
6755                 hash_lck = vm_object_hash_lock_spin(object1->pager);
6756                 hash_entry = vm_object_hash_lookup(object1->pager, FALSE);
6757                 assert(hash_entry != VM_OBJECT_HASH_ENTRY_NULL);
6758                 hash_entry->object = object1;
6759                 vm_object_hash_unlock(hash_lck);
6760         }
6761         __TRANSPOSE_FIELD(hashed);
6762         object1->transposed = TRUE;
6763         object2->transposed = TRUE;
6764         __TRANSPOSE_FIELD(mapping_in_progress);
6765         __TRANSPOSE_FIELD(volatile_empty);
6766         __TRANSPOSE_FIELD(volatile_fault);
6767         __TRANSPOSE_FIELD(all_reusable);
6768         assert(object1->blocked_access);
6769         assert(object2->blocked_access);
6770         assert(object1->__object2_unused_bits == 0);
6771         assert(object2->__object2_unused_bits == 0);
6772 #if UPL_DEBUG
6773         /* "uplq" refers to the object not its contents (see upl_transpose()) */
6774 #endif
6775         assert(object1->objq.next == NULL);
6776         assert(object1->objq.prev == NULL);
6777         assert(object2->objq.next == NULL);
6778         assert(object2->objq.prev == NULL);
6779
6780 #undef __TRANSPOSE_FIELD
6781
6782         retval = KERN_SUCCESS;
6783
6784 done:
6785         /*
6786          * Cleanup.
6787          */
6788         if (tmp_object != VM_OBJECT_NULL) {
6789                 vm_object_unlock(tmp_object);
6790                 /*
6791                  * Re-initialize the temporary object to avoid
6792                  * deallocating a real pager.
6793                  */
6794                 _vm_object_allocate(transpose_size, tmp_object);
6795                 vm_object_deallocate(tmp_object);
6796                 tmp_object = VM_OBJECT_NULL;
6797         }
6798
6799         if (object1_locked) {
6800                 vm_object_unlock(object1);
6801                 object1_locked = FALSE;
6802         }
6803         if (object2_locked) {
6804                 vm_object_unlock(object2);
6805                 object2_locked = FALSE;
6806         }
6807
6808         vm_object_transpose_count++;
6809
6810         return retval;
6811 }
6812
6813
6814 /*
6815  *      vm_object_cluster_size
6816  *
6817  *      Determine how big a cluster we should issue an I/O for...
6818  *
6819  *      Inputs:   *start == offset of page needed
6820  *                *length == maximum cluster pager can handle
6821  *      Outputs:  *start == beginning offset of cluster
6822  *                *length == length of cluster to try
6823  *
6824  *      The original *start will be encompassed by the cluster
6825  *
6826  */
6827 extern int speculative_reads_disabled;      /* non-zero: vm_object_cluster_size() returns a single-page cluster */
     /*
      * Pre-heat tunables, expressed in pages and read by
      * vm_object_cluster_size():
      *   preheat_pages_max  - upper bound on the cluster size; values above
      *                        MAX_UPL_TRANSFER are clamped to MAX_UPL_TRANSFER
      *   preheat_pages_min  - floor for the ratio-scaled cluster size; values
      *                        below 1 are treated as 1
      *   preheat_pages_mult - multiplier applied to the "prime the pump" and
      *                        ratio-scaled sizes; values below 1 are treated as 1
      * Both branches of the #if below currently define identical values.
      */
6828 #if CONFIG_EMBEDDED
6829 unsigned int preheat_pages_max = MAX_UPL_TRANSFER;
6830 unsigned int preheat_pages_min = 8;
6831 unsigned int preheat_pages_mult = 4;
6832 #else
6833 unsigned int preheat_pages_max = MAX_UPL_TRANSFER;
6834 unsigned int preheat_pages_min = 8;
6835 unsigned int preheat_pages_mult = 4;
6836 #endif
6837
     /*
      * Counters bumped by vm_object_cluster_size(), indexed by a size in pages
      * (0 .. MAX_UPL_TRANSFER):
      *   pre_heat_scaling - indexed by the computed pre-heat size
      *   pre_heat_cluster - indexed by the cluster length finally returned
      */
6838 uint32_t pre_heat_scaling[MAX_UPL_TRANSFER + 1];
6839 uint32_t pre_heat_cluster[MAX_UPL_TRANSFER + 1];
6840
6841
6842 __private_extern__ void
6843 vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start,
6844                        vm_size_t *length, vm_object_fault_info_t fault_info, uint32_t *io_streaming)
6845 {
             /*
              * Chooses the range of pages to request around a faulting offset.
              *
              * object:       object being faulted on; its lock is taken here and
              *               dropped at the "out" label.
              * start:        in:  page-aligned faulting offset.
              *               out: lowered by the look-behind scan when earlier
              *                    pages are added, otherwise left unchanged.
              * length:       in:  page-aligned upper bound on the cluster size.
              *               out: cluster size in bytes; reset to PAGE_SIZE, grown
              *                    one page at a time, then clamped to max_length.
              * fault_info:   supplies cluster_size, behavior, user_tag, lo_offset
              *               and hi_offset; NULL means "no clustering".
              * io_streaming: out: 1 when a sequential (forward or reverse) access
              *               pattern was selected, 0 otherwise.
              */
6846         vm_size_t               pre_heat_size;
6847         vm_size_t               tail_size;
6848         vm_size_t               head_size;
6849         vm_size_t               max_length;
6850         vm_size_t               cluster_size;
6851         vm_object_offset_t      object_size;
6852         vm_object_offset_t      orig_start;
6853         vm_object_offset_t      target_start;
6854         vm_object_offset_t      offset;
6855         vm_behavior_t           behavior;
6856         boolean_t               look_behind = TRUE;
6857         boolean_t               look_ahead  = TRUE;
6858         uint32_t                throttle_limit;
6859         int                     sequential_run;
6860         int                     sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
6861         unsigned int            max_ph_size;
6862         unsigned int            min_ph_size;
6863         unsigned int            ph_mult;
6864
             /* both inputs must be page aligned */
6865         assert( !(*length & PAGE_MASK));
6866         assert( !(*start & PAGE_MASK_64));
6867
             /*
              * copy the global tunables into locals, clamping each one:
              * multiplier and minimum are at least 1, maximum is at most
              * MAX_UPL_TRANSFER
              */
6868         if ( (ph_mult = preheat_pages_mult) < 1 )
6869                 ph_mult = 1;
6870         if ( (min_ph_size = preheat_pages_min) < 1 )
6871                 min_ph_size = 1;
6872         if ( (max_ph_size = preheat_pages_max) > MAX_UPL_TRANSFER )
6873                 max_ph_size = MAX_UPL_TRANSFER;
6874
             /*
              * the cluster may exceed neither the caller's limit nor
              * max_ph_size pages
              */
6875         if ( (max_length = *length) > (max_ph_size * PAGE_SIZE) )
6876                 max_length = (max_ph_size * PAGE_SIZE);
6877
6878         /*
6879          * we'll always return a cluster size of at least
6880          * 1 page, since the original fault must always
6881          * be processed
6882          */
6883         *length = PAGE_SIZE;
6884         *io_streaming = 0;
6885
6886         if (speculative_reads_disabled || fault_info == NULL || max_length == 0) {
6887                 /*
6888                  * no cluster... just fault the page in
                      * (the object lock has not been taken yet, so this path
                      * returns directly instead of going through "out")
6889                  */
6890                 return;
6891         }
6892         orig_start = *start;
6893         target_start = orig_start;
6894         cluster_size = round_page(fault_info->cluster_size);
6895         behavior = fault_info->behavior;
6896
6897         vm_object_lock(object);
6898
             /*
              * internal objects use the object's own size; otherwise the
              * size is fetched through the pager.
              * NOTE(review): the return value of vnode_pager_get_object_size()
              * is ignored here -- confirm it always sets object_size.
              */
6899         if (object->internal)
6900                 object_size = object->size;
6901         else if (object->pager != MEMORY_OBJECT_NULL)
6902                 vnode_pager_get_object_size(object->pager, &object_size);
6903         else
6904                 goto out;       /* pager is gone for this object, nothing more to do */
6905
6906         object_size = round_page_64(object_size);
6907
6908         if (orig_start >= object_size) {
6909                 /*
6910                  * fault occurred beyond the EOF...
6911                  * we need to punt w/o changing the
6912                  * starting offset
6913                  */
6914                 goto out;
6915         }
6916         if (object->pages_used > object->pages_created) {
6917                 /*
6918                  * must have wrapped our 32 bit counters
6919                  * so reset
6920                  */
6921                 object->pages_used = object->pages_created = 0;
6922         }
             /*
              * object->sequential is signed: a negative value selects the
              * reverse-sequential behavior.  sequential_run keeps the
              * magnitude, sequential_behavior keeps the direction.
              */
6923         if ((sequential_run = object->sequential)) {
6924                   if (sequential_run < 0) {
6925                           sequential_behavior = VM_BEHAVIOR_RSEQNTL;
6926                           sequential_run = 0 - sequential_run;
6927                   } else {
6928                           sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
6929                   }
6930
6931         }
6932         switch(behavior) {
6933
6934         default:
                     /* unrecognized behaviors are handled as VM_BEHAVIOR_DEFAULT */
6935                 behavior = VM_BEHAVIOR_DEFAULT;
6936                 /* FALLTHROUGH */
6937         case VM_BEHAVIOR_DEFAULT:
                     /* no clustering for internal objects tagged as stack memory */
6938                 if (object->internal && fault_info->user_tag == VM_MEMORY_STACK)
6939                         goto out;
6940
6941                 if (sequential_run >= (3 * PAGE_SIZE)) {
                             /*
                              * a run of at least 3 pages is treated as
                              * streaming: only look in the direction of the run
                              */
6942                         pre_heat_size = sequential_run + PAGE_SIZE;
6943
6944                         if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL)
6945                                 look_behind = FALSE;
6946                         else
6947                                 look_ahead = FALSE;
6948
6949                         *io_streaming = 1;
6950                 } else {
6951
6952                         if (object->pages_created < 32 * ph_mult) {
6953                                 /*
6954                                  * prime the pump
                                      * (fewer than 32 * ph_mult pages created so
                                      * far: use a fixed 8 * ph_mult pages; the
                                      * break leaves the switch)
6955                                  */
6956                                 pre_heat_size = PAGE_SIZE * 8 * ph_mult;
6957                                 break;
6958                         }
6959                         /*
6960                          * Linear growth in PH size: The maximum size is max_length...
6961                          * this calculation will result in a size that is neither a
6962                          * power of 2 nor a multiple of PAGE_SIZE... so round
6963                          * it up to the nearest PAGE_SIZE boundary
                              * (pages_created is at least 32 * ph_mult here, so the
                              * divisor below is non-zero)
6964                          */
6965                         pre_heat_size = (ph_mult * (max_length * object->pages_used) / object->pages_created);
6966
6967                         if (pre_heat_size < PAGE_SIZE * min_ph_size)
6968                                 pre_heat_size = PAGE_SIZE * min_ph_size;
6969                         else
6970                                 pre_heat_size = round_page(pre_heat_size);
6971                 }
6972                 break;
6973
6974         case VM_BEHAVIOR_RANDOM:
                     /* cluster only when fault_info asks for more than one page */
6975                 if ((pre_heat_size = cluster_size) <= PAGE_SIZE)
6976                         goto out;
6977                 break;
6978
6979         case VM_BEHAVIOR_SEQUENTIAL:
                     /*
                      * use fault_info's cluster size if it has one, otherwise
                      * size the cluster from the observed run; forward only
                      */
6980                 if ((pre_heat_size = cluster_size) == 0)
6981                         pre_heat_size = sequential_run + PAGE_SIZE;
6982                 look_behind = FALSE;
6983                 *io_streaming = 1;
6984
6985                 break;
6986
6987         case VM_BEHAVIOR_RSEQNTL:
                     /* same sizing as VM_BEHAVIOR_SEQUENTIAL, but backward only */
6988                 if ((pre_heat_size = cluster_size) == 0)
6989                         pre_heat_size = sequential_run + PAGE_SIZE;
6990                 look_ahead = FALSE;
6991                 *io_streaming = 1;
6992
6993                 break;
6994
6995         }
             /*
              * when vnode_pager_check_hard_throttle() succeeds, the limit it
              * hands back lowers max_length if it is smaller
              */
6996         throttle_limit = (uint32_t) max_length;
6997         assert(throttle_limit == max_length);
6998
6999         if (vnode_pager_check_hard_throttle(object->pager, &throttle_limit, *io_streaming) == KERN_SUCCESS) {
7000                 if (max_length > throttle_limit)
7001                         max_length = throttle_limit;
7002         }
7003         if (pre_heat_size > max_length)
7004                 pre_heat_size = max_length;
7005
7006         if (behavior == VM_BEHAVIOR_DEFAULT) {
                     /*
                      * shrink the default cluster when free pages are scarce:
                      * to 1/8 below vm_page_throttle_limit, to 1/2 below
                      * vm_page_free_target
                      */
7007                 if (vm_page_free_count < vm_page_throttle_limit)
7008                         pre_heat_size = trunc_page(pre_heat_size / 8);
7009                 else if (vm_page_free_count < vm_page_free_target)
7010                         pre_heat_size = trunc_page(pre_heat_size / 2);
7011
7012                 if (pre_heat_size <= PAGE_SIZE)
7013                         goto out;
7014         }
7015         if (look_ahead == TRUE) {
7016                 if (look_behind == TRUE) {
7017                         /*
7018                          * if we get here it's due to a random access...
7019                          * so we want to center the original fault address
7020                          * within the cluster we will issue... make sure
7021                          * to calculate 'head_size' as a multiple of PAGE_SIZE...
7022                          * 'pre_heat_size' is a multiple of PAGE_SIZE but not
7023                          * necessarily an even number of pages so we need to truncate
7024                          * the result to a PAGE_SIZE boundary
7025                          */
7026                         head_size = trunc_page(pre_heat_size / 2);
7027
7028                         if (target_start > head_size)
7029                                 target_start -= head_size;
7030                         else
7031                                 target_start = 0;
7032
7033                         /*
7034                          * 'target_start' at this point represents the beginning offset
7035                          * of the cluster we are considering... 'orig_start' will be in
7036                          * the center of this cluster if we didn't have to clip the start
7037                          * due to running into the start of the file
7038                          */
7039                 }
                     /* clip the cluster so that it does not run past the object's size */
7040                 if ((target_start + pre_heat_size) > object_size)
7041                         pre_heat_size = (vm_size_t)(round_page_64(object_size - target_start));
7042                 /*
7043                  * at this point calculate the number of pages beyond the original fault
7044                  * address that we want to consider... this is guaranteed not to extend beyond
7045                  * the current EOF...
7046                  */
7047                 assert((vm_size_t)(orig_start - target_start) == (orig_start - target_start));
7048                 tail_size = pre_heat_size - (vm_size_t)(orig_start - target_start) - PAGE_SIZE;
7049         } else {
                     /*
                      * backward only: the cluster cannot reach below offset 0,
                      * so cap it at the faulting offset; nothing is read ahead
                      */
7050                 if (pre_heat_size > target_start)
7051                         pre_heat_size = (vm_size_t) target_start; /* XXX: 32-bit vs 64-bit ? Joe ? */
7052                 tail_size = 0;
7053         }
7054         assert( !(target_start & PAGE_MASK_64));
7055         assert( !(pre_heat_size & PAGE_MASK));
7056
             /* statistics: count the computed pre-heat size, in pages */
7057         pre_heat_scaling[pre_heat_size / PAGE_SIZE]++;
7058
7059         if (pre_heat_size <= PAGE_SIZE)
7060                 goto out;
7061
7062         if (look_behind == TRUE) {
7063                 /*
7064                  * take a look at the pages before the original
7065                  * faulting offset... recalculate this in case
7066                  * we had to clip 'pre_heat_size' above to keep
7067                  * from running past the EOF.
7068                  */
7069                 head_size = pre_heat_size - tail_size - PAGE_SIZE;
7070
                     /*
                      * walk backwards one page at a time; each accepted page
                      * moves *start down and grows *length
                      */
7071                 for (offset = orig_start - PAGE_SIZE_64; head_size; offset -= PAGE_SIZE_64, head_size -= PAGE_SIZE) {
7072                         /*
7073                          * don't poke below the lowest offset
7074                          */
7075                         if (offset < fault_info->lo_offset)
7076                                 break;
7077                         /*
7078                          * for external objects and internal objects w/o an existence map
7079                          * vm_external_state_get will return VM_EXTERNAL_STATE_UNKNOWN
7080                          */
7081 #if MACH_PAGEMAP
7082                         if (vm_external_state_get(object->existence_map, offset) == VM_EXTERNAL_STATE_ABSENT) {
7083                                 /*
7084                                  * we know for a fact that the pager can't provide the page
7085                                  * so don't include it or any pages beyond it in this cluster
7086                                  */
7087                                 break;
7088                         }
7089 #endif
7090                         if (vm_page_lookup(object, offset) != VM_PAGE_NULL) {
7091                                 /*
7092                                  * don't bridge resident pages
7093                                  */
7094                                 break;
7095                         }
7096                         *start = offset;
7097                         *length += PAGE_SIZE;
7098                 }
7099         }
7100         if (look_ahead == TRUE) {
                     /*
                      * walk forwards one page at a time; each accepted page
                      * grows *length (*start is not touched here)
                      */
7101                 for (offset = orig_start + PAGE_SIZE_64; tail_size; offset += PAGE_SIZE_64, tail_size -= PAGE_SIZE) {
7102                         /*
7103                          * don't poke above the highest offset
7104                          */
7105                         if (offset >= fault_info->hi_offset)
7106                                 break;
7107                         assert(offset < object_size);
7108
7109                         /*
7110                          * for external objects and internal objects w/o an existence map
7111                          * vm_external_state_get will return VM_EXTERNAL_STATE_UNKNOWN
7112                          */
7113 #if MACH_PAGEMAP
7114                         if (vm_external_state_get(object->existence_map, offset) == VM_EXTERNAL_STATE_ABSENT) {
7115                                 /*
7116                                  * we know for a fact that the pager can't provide the page
7117                                  * so don't include it or any pages beyond it in this cluster
7118                                  */
7119                                 break;
7120                         }
7121 #endif
7122                         if (vm_page_lookup(object, offset) != VM_PAGE_NULL) {
7123                                 /*
7124                                  * don't bridge resident pages
7125                                  */
7126                                 break;
7127                         }
7128                         *length += PAGE_SIZE;
7129                 }
7130         }
7131 out:
             /*
              * common exit for every path that holds the object lock:
              * clamp the result, count the returned cluster size (in pages)
              * and drop the lock
              */
7132         if (*length > max_length)
7133                 *length = max_length;
7134
7135         pre_heat_cluster[*length / PAGE_SIZE]++;
7136
7137         vm_object_unlock(object);
7138 }
7139
7140
7141 /*
7142  * Allow manipulation of individual page state.  This is actually part of
7143  * the UPL regimen but takes place on the VM object rather than on a UPL
      *
      * object:     VM object owning the page; locked on entry to this routine
      *             and unlocked on every return path.
      * offset:     offset of the page within the object, as passed to
      *             vm_page_lookup().
      * ops:        mask of UPL_POP_* bits.  UPL_POP_PHYSICAL and UPL_POP_DUMP
      *             select special operations; UPL_POP_SET / UPL_POP_CLR set or
      *             clear the page bits named by UPL_POP_DIRTY, UPL_POP_PAGEOUT,
      *             UPL_POP_PRECIOUS, UPL_POP_ABSENT and UPL_POP_BUSY.
      * phys_entry: optional out parameter receiving a physical page number.
      * flags:      optional out parameter receiving the page's state bits as
      *             they were before "ops" was applied.
      *
      * Returns KERN_SUCCESS on success, KERN_FAILURE when no page is resident
      * at "offset", and KERN_INVALID_OBJECT when UPL_POP_PHYSICAL is used on
      * an object that is not phys_contiguous or any other operation is used
      * on one that is.
7144  */
7145
7146 kern_return_t
7147 vm_object_page_op(
7148         vm_object_t             object,
7149         vm_object_offset_t      offset,
7150         int                     ops,
7151         ppnum_t                 *phys_entry,
7152         int                     *flags)
7153 {
7154         vm_page_t               dst_page;
7155
7156         vm_object_lock(object);
7157
             /*
              * UPL_POP_PHYSICAL only applies to phys_contiguous objects: the
              * page number reported is derived from the object's shadow_offset,
              * and "offset" is not consulted.
              */
7158         if(ops & UPL_POP_PHYSICAL) {
7159                 if(object->phys_contiguous) {
7160                         if (phys_entry) {
7161                                 *phys_entry = (ppnum_t)
7162                                         (object->shadow_offset >> PAGE_SHIFT);
7163                         }
7164                         vm_object_unlock(object);
7165                         return KERN_SUCCESS;
7166                 } else {
7167                         vm_object_unlock(object);
7168                         return KERN_INVALID_OBJECT;
7169                 }
7170         }
             /* every other operation is refused on phys_contiguous objects */
7171         if(object->phys_contiguous) {
7172                 vm_object_unlock(object);
7173                 return KERN_INVALID_OBJECT;
7174         }
7175
             /*
              * the loop only repeats after sleeping on a busy page (the
              * "continue" below); every other path leaves it
              */
7176         while(TRUE) {
7177                 if((dst_page = vm_page_lookup(object,offset)) == VM_PAGE_NULL) {
7178                         vm_object_unlock(object);
7179                         return KERN_FAILURE;
7180                 }
7181
7182                 /* Sync up on getting the busy bit */
                     /* (only needed when setting "busy" or dumping the page) */
7183                 if((dst_page->busy || dst_page->cleaning) &&
7184                            (((ops & UPL_POP_SET) &&
7185                            (ops & UPL_POP_BUSY)) || (ops & UPL_POP_DUMP))) {
7186                         /* someone else is playing with the page, we will */
7187                         /* have to wait */
7188                         PAGE_SLEEP(object, dst_page, THREAD_UNINT);
                             /* the page is looked up again after the sleep */
7189                         continue;
7190                 }
7191
7192                 if (ops & UPL_POP_DUMP) {
                             /*
                              * remove the mappings if the page is marked as
                              * pmapped, then free it; "flags" and "phys_entry"
                              * are not written on this path
                              */
7193                         if (dst_page->pmapped == TRUE)
7194                                 pmap_disconnect(dst_page->phys_page);
7195
7196                         VM_PAGE_FREE(dst_page);
7197                         break;
7198                 }
7199
7200                 if (flags) {
7201                         *flags = 0;
7202
7203                         /* Get the condition of flags before requested ops */
7204                         /* are undertaken */
7205
7206                         if(dst_page->dirty) *flags |= UPL_POP_DIRTY;
7207                         if(dst_page->pageout) *flags |= UPL_POP_PAGEOUT;
7208                         if(dst_page->precious) *flags |= UPL_POP_PRECIOUS;
7209                         if(dst_page->absent) *flags |= UPL_POP_ABSENT;
7210                         if(dst_page->busy) *flags |= UPL_POP_BUSY;
7211                 }
7212
7213                 /* The caller should have made a call either contingent with */
7214                 /* or prior to this call to set UPL_POP_BUSY */
7215                 if(ops & UPL_POP_SET) {
7216                         /* The protection granted with this assert will */
7217                         /* not be complete.  If the caller violates the */
7218                         /* convention and attempts to change page state */
7219                         /* without first setting busy we may not see it */
7220                         /* because the page may already be busy.  However */
7221                         /* if such violations occur we will assert sooner */
7222                         /* or later. */
7223                         assert(dst_page->busy || (ops & UPL_POP_BUSY));
7224                         if (ops & UPL_POP_DIRTY) dst_page->dirty = TRUE;
7225                         if (ops & UPL_POP_PAGEOUT) dst_page->pageout = TRUE;
7226                         if (ops & UPL_POP_PRECIOUS) dst_page->precious = TRUE;
7227                         if (ops & UPL_POP_ABSENT) dst_page->absent = TRUE;
7228                         if (ops & UPL_POP_BUSY) dst_page->busy = TRUE;
7229                 }
7230
                     /*
                      * UPL_POP_CLR is applied after UPL_POP_SET, so a bit named
                      * with both ends up cleared
                      */
7231                 if(ops & UPL_POP_CLR) {
7232                         assert(dst_page->busy);
7233                         if (ops & UPL_POP_DIRTY) dst_page->dirty = FALSE;
7234                         if (ops & UPL_POP_PAGEOUT) dst_page->pageout = FALSE;
7235                         if (ops & UPL_POP_PRECIOUS) dst_page->precious = FALSE;
7236                         if (ops & UPL_POP_ABSENT) dst_page->absent = FALSE;
7237                         if (ops & UPL_POP_BUSY) {
7238                                 dst_page->busy = FALSE;
7239                                 PAGE_WAKEUP(dst_page);
7240                         }
7241                 }
7242
7243                 if (dst_page->encrypted) {
7244                         /*
7245                          * ENCRYPTED SWAP:
7246                          * We need to decrypt this encrypted page before the
7247                          * caller can access its contents.
7248                          * But if the caller really wants to access the page's
7249                          * contents, they have to keep the page "busy".
7250                          * Otherwise, the page could get recycled or re-encrypted
7251                          * at any time.
7252                          */
7253                         if ((ops & UPL_POP_SET) && (ops & UPL_POP_BUSY) &&
7254                             dst_page->busy) {
7255                                 /*
7256                                  * The page is stable enough to be accessed by
7257                                  * the caller, so make sure its contents are
7258                                  * not encrypted.
7259                                  */
7260                                 vm_page_decrypt(dst_page, 0);
7261                         } else {
7262                                 /*
7263                                  * The page is not busy, so don't bother
7264                                  * decrypting it, since anything could
7265                                  * happen to it between now and when the
7266                                  * caller wants to access it.
7267                                  * We should not give the caller access
7268                                  * to this page.
7269                                  */
7270                                 assert(!phys_entry);
7271                         }
7272                 }
7273
7274                 if (phys_entry) {
7275                         /*
7276                          * The physical page number will remain valid
7277                          * only if the page is kept busy.
7278                          * ENCRYPTED SWAP: make sure we don't let the
7279                          * caller access an encrypted page.
7280                          */
7281                         assert(dst_page->busy);
7282                         assert(!dst_page->encrypted);
7283                         *phys_entry = dst_page->phys_page;
7284                 }
7285
7286                 break;
7287         }
7288
7289         vm_object_unlock(object);
7290         return KERN_SUCCESS;
7291
7292 }
7293
7294 /*
7295  * vm_object_range_op offers performance enhancement over
7296  * vm_object_page_op for page_op functions which do not require page
7297  * level state to be returned from the call.  Page_op was created to provide
7298  * a low-cost alternative to page manipulation via UPLs when only a single
7299  * page was involved.  The range_op call establishes the ability in the _op
7300  * family of functions to work on multiple pages where the lack of page level
7301  * state handling allows the caller to avoid the overhead of the upl structures.
7302  */
7303
7304 kern_return_t
7305 vm_object_range_op(
7306         vm_object_t             object,
7307         vm_object_offset_t      offset_beg,
7308         vm_object_offset_t      offset_end,
7309         int                     ops,
7310         uint32_t                *range)
7311 {
7312         vm_object_offset_t      offset;
7313         vm_page_t               dst_page;
7314
7315         if (offset_end - offset_beg > (uint32_t) -1) {
7316                 /* range is too big and would overflow "*range" */
7317                 return KERN_INVALID_ARGUMENT;
7318         }
7319         if (object->resident_page_count == 0) {
7320                 if (range) {
7321                         if (ops & UPL_ROP_PRESENT) {
7322                                 *range = 0;
7323                         } else {
7324                                 *range = (uint32_t) (offset_end - offset_beg);
7325                                 assert(*range == (offset_end - offset_beg));
7326                         }
7327                 }
7328                 return KERN_SUCCESS;
7329         }
7330         vm_object_lock(object);
7331
7332         if (object->phys_contiguous) {
7333                 vm_object_unlock(object);
7334                 return KERN_INVALID_OBJECT;
7335         }
7336
7337         offset = offset_beg & ~PAGE_MASK_64;
7338
7339         while (offset < offset_end) {
7340                 dst_page = vm_page_lookup(object, offset);
7341                 if (dst_page != VM_PAGE_NULL) {
7342                         if (ops & UPL_ROP_DUMP) {
7343                                 if (dst_page->busy || dst_page->cleaning) {
7344                                         /*
7345                                          * someone else is playing with the
7346                                          * page, we will have to wait
7347                                          */
7348                                         PAGE_SLEEP(object, dst_page, THREAD_UNINT);
7349                                         /*
7350                                          * need to relook the page up since it's
7351                                          * state may have changed while we slept
7352                                          * it might even belong to a different object
7353                                          * at this point
7354                                          */
7355                                         continue;
7356                                 }
7357                                 if (dst_page->pmapped == TRUE)
7358                                         pmap_disconnect(dst_page->phys_page);
7359
7360                                 VM_PAGE_FREE(dst_page);
7361
7362                         } else if ((ops & UPL_ROP_ABSENT) && !dst_page->absent)
7363                                 break;
7364                 } else if (ops & UPL_ROP_PRESENT)
7365                         break;
7366
7367                 offset += PAGE_SIZE;
7368         }
7369         vm_object_unlock(object);
7370
7371         if (range) {
7372                 if (offset > offset_end)
7373                         offset = offset_end;
7374                 if(offset > offset_beg) {
7375                         *range = (uint32_t) (offset - offset_beg);
7376                         assert(*range == (offset - offset_beg));
7377                 } else {
7378                         *range = 0;
7379                 }
7380         }
7381         return KERN_SUCCESS;
7382 }
7383
7384
/*
 * Statistics counter: bumped (with a plain, non-atomic increment) every
 * time vm_object_lock() or vm_object_lock_avoid() is handed the object
 * currently recorded in vm_pageout_scan_wants_object.
 */
uint32_t scan_object_collision = 0;
7386
7387 void
7388 vm_object_lock(vm_object_t object)
7389 {
7390         if (object == vm_pageout_scan_wants_object) {
7391                 scan_object_collision++;
7392                 mutex_pause(2);
7393         }
7394         lck_rw_lock_exclusive(&object->Lock);
7395 }
7396
7397 boolean_t
7398 vm_object_lock_avoid(vm_object_t object)
7399 {
7400         if (object == vm_pageout_scan_wants_object) {
7401                 scan_object_collision++;
7402                 return TRUE;
7403         }
7404         return FALSE;
7405 }
7406
7407 boolean_t
7408 _vm_object_lock_try(vm_object_t object)
7409 {
7410         return (lck_rw_try_lock_exclusive(&object->Lock));
7411 }
7412
7413 boolean_t
7414 vm_object_lock_try(vm_object_t object)
7415 {
7416     // called from hibernate path so check before blocking
7417         if (vm_object_lock_avoid(object) && ml_get_interrupts_enabled()) {
7418                 mutex_pause(2);
7419         }
7420         return _vm_object_lock_try(object);
7421 }
7422 void
7423 vm_object_lock_shared(vm_object_t object)
7424 {
7425         if (vm_object_lock_avoid(object)) {
7426                 mutex_pause(2);
7427         }
7428         lck_rw_lock_shared(&object->Lock);
7429 }
7430
7431 boolean_t
7432 vm_object_lock_try_shared(vm_object_t object)
7433 {
7434         if (vm_object_lock_avoid(object)) {
7435                 mutex_pause(2);
7436         }
7437         return (lck_rw_try_lock_shared(&object->Lock));
7438 }