1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_object.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Virtual memory object module.
63 */
64
65 #include <debug.h>
66 #include <mach_pagemap.h>
67 #include <task_swapper.h>
68
69 #include <mach/mach_types.h>
70 #include <mach/memory_object.h>
71 #include <mach/memory_object_default.h>
72 #include <mach/memory_object_control_server.h>
73 #include <mach/vm_param.h>
74
75 #include <mach/sdt.h>
76
77 #include <ipc/ipc_types.h>
78 #include <ipc/ipc_port.h>
79
80 #include <kern/kern_types.h>
81 #include <kern/assert.h>
82 #include <kern/queue.h>
83 #include <kern/xpr.h>
84 #include <kern/kalloc.h>
85 #include <kern/zalloc.h>
86 #include <kern/host.h>
87 #include <kern/host_statistics.h>
88 #include <kern/processor.h>
89 #include <kern/misc_protos.h>
90
91 #include <vm/memory_object.h>
92 #include <vm/vm_compressor_pager.h>
93 #include <vm/vm_fault.h>
94 #include <vm/vm_map.h>
95 #include <vm/vm_object.h>
96 #include <vm/vm_page.h>
97 #include <vm/vm_pageout.h>
98 #include <vm/vm_protos.h>
99 #include <vm/vm_purgeable_internal.h>
100
101 #include <vm/vm_compressor.h>
102
103 #if CONFIG_PHANTOM_CACHE
104 #include <vm/vm_phantom_cache.h>
105 #endif
106
107 boolean_t vm_object_collapse_compressor_allowed = TRUE;
108
109 struct vm_counters vm_counters;
110
111 #if VM_OBJECT_TRACKING
112 boolean_t vm_object_tracking_inited = FALSE;
113 decl_simple_lock_data(static,vm_object_tracking_lock_data);
114 btlog_t *vm_object_tracking_btlog;
115 static void
116 vm_object_tracking_lock(void *context)
117 {
118 simple_lock((simple_lock_t)context);
119 }
120 static void
121 vm_object_tracking_unlock(void *context)
122 {
123 simple_unlock((simple_lock_t)context);
124 }
125 void
126 vm_object_tracking_init(void)
127 {
128 int vm_object_tracking;
129
130 vm_object_tracking = 1;
131 PE_parse_boot_argn("vm_object_tracking", &vm_object_tracking,
132 sizeof (vm_object_tracking));
133
134 if (vm_object_tracking) {
135 simple_lock_init(&vm_object_tracking_lock_data, 0);
136 vm_object_tracking_btlog = btlog_create(
137 50000,
138 VM_OBJECT_TRACKING_BTDEPTH,
139 vm_object_tracking_lock,
140 vm_object_tracking_unlock,
141 &vm_object_tracking_lock_data);
142 assert(vm_object_tracking_btlog);
143 vm_object_tracking_inited = TRUE;
144 }
145 }
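/*
 * Tracking is on by default when this code is compiled in; a
 * "vm_object_tracking=0" boot-arg (parsed above with PE_parse_boot_argn)
 * turns it off.  When enabled, a btlog (50000 records here) captures a
 * backtrace for each tracked object event, e.g. object creation in
 * _vm_object_allocate().
 */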
146 #endif /* VM_OBJECT_TRACKING */
147
148 /*
149 * Virtual memory objects maintain the actual data
150 * associated with allocated virtual memory. A given
151 * page of memory exists within exactly one object.
152 *
153 * An object is only deallocated when all "references"
154 * are given up.
155 *
156 * Associated with each object is a list of all resident
157 * memory pages belonging to that object; this list is
158 * maintained by the "vm_page" module, but locked by the object's
159 * lock.
160 *
161 * Each object also records the memory object reference
162 * that is used by the kernel to request and write
163 * back data (the memory object, field "pager"), etc...
164 *
165 * Virtual memory objects are allocated to provide
166 * zero-filled memory (vm_allocate) or map a user-defined
167 * memory object into a virtual address space (vm_map).
168 *
169 * Virtual memory objects that refer to a user-defined
170 * memory object are called "permanent", because all changes
171 * made in virtual memory are reflected back to the
172  *      memory manager, which may then store them permanently.
173 * Other virtual memory objects are called "temporary",
174 * meaning that changes need be written back only when
175 * necessary to reclaim pages, and that storage associated
176 * with the object can be discarded once it is no longer
177 * mapped.
178 *
179 * A permanent memory object may be mapped into more
180 * than one virtual address space. Moreover, two threads
181 * may attempt to make the first mapping of a memory
182 * object concurrently. Only one thread is allowed to
183  *      complete this mapping; all others wait until the
184 * "pager_initialized" field is asserted, indicating
185 * that the first thread has initialized all of the
186 * necessary fields in the virtual memory object structure.
187 *
188 * The kernel relies on a *default memory manager* to
189 * provide backing storage for the zero-filled virtual
190 * memory objects. The pager memory objects associated
191 * with these temporary virtual memory objects are only
192 * requested from the default memory manager when it
193 * becomes necessary. Virtual memory objects
194 * that depend on the default memory manager are called
195 * "internal". The "pager_created" field is provided to
196  *      indicate whether such a pager has ever been created.
197 *
198 * The kernel may also create virtual memory objects to
199 * hold changed pages after a copy-on-write operation.
200 * In this case, the virtual memory object (and its
201 * backing storage -- its memory object) only contain
202 * those pages that have been changed. The "shadow"
203 * field refers to the virtual memory object that contains
204 * the remainder of the contents. The "shadow_offset"
205 * field indicates where in the "shadow" these contents begin.
206 * The "copy" field refers to a virtual memory object
207 * to which changed pages must be copied before changing
208 * this object, in order to implement another form
209 * of copy-on-write optimization.
210 *
211 * The virtual memory object structure also records
212 * the attributes associated with its memory object.
213 * The "pager_ready", "can_persist" and "copy_strategy"
214 * fields represent those attributes. The "cached_list"
215 * field is used in the implementation of the persistence
216 * attribute.
217 *
218 * ZZZ Continue this comment.
219 */
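/*
 * A rough sketch of the shadow arrangement described above, after a
 * copy-on-write fault on a mapped object:
 *
 *	mapped object:  holds only the pages that have been modified
 *	    shadow           ---> original object (unmodified pages)
 *	    vo_shadow_offset ---> where this object's contents begin
 *	                          within the shadow
 *
 * The "copy" field points the other way: from an original object to the
 * object that must receive copies of its pages before they are changed.
 */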
220
221 /* Forward declarations for internal functions. */
222 static kern_return_t vm_object_terminate(
223 vm_object_t object);
224
225 extern void vm_object_remove(
226 vm_object_t object);
227
228 static kern_return_t vm_object_copy_call(
229 vm_object_t src_object,
230 vm_object_offset_t src_offset,
231 vm_object_size_t size,
232 vm_object_t *_result_object);
233
234 static void vm_object_do_collapse(
235 vm_object_t object,
236 vm_object_t backing_object);
237
238 static void vm_object_do_bypass(
239 vm_object_t object,
240 vm_object_t backing_object);
241
242 static void vm_object_release_pager(
243 memory_object_t pager,
244 boolean_t hashed);
245
246 static zone_t vm_object_zone; /* vm backing store zone */
247
248 /*
249 * All wired-down kernel memory belongs to a single virtual
250 * memory object (kernel_object) to avoid wasting data structures.
251 */
252 static struct vm_object kernel_object_store;
253 vm_object_t kernel_object;
254
255 static struct vm_object compressor_object_store;
256 vm_object_t compressor_object = &compressor_object_store;
257
258 /*
259 * The submap object is used as a placeholder for vm_map_submap
260 * operations. The object is declared in vm_map.c because it
261 * is exported by the vm_map module. The storage is declared
262 * here because it must be initialized here.
263 */
264 static struct vm_object vm_submap_object_store;
265
266 /*
267 * Virtual memory objects are initialized from
268 * a template (see vm_object_allocate).
269 *
270 * When adding a new field to the virtual memory
271 * object structure, be sure to add initialization
272 * (see _vm_object_allocate()).
273 */
274 static struct vm_object vm_object_template;
275
276 unsigned int vm_page_purged_wired = 0;
277 unsigned int vm_page_purged_busy = 0;
278 unsigned int vm_page_purged_others = 0;
279
280 #if VM_OBJECT_CACHE
281 /*
282 * Virtual memory objects that are not referenced by
283 * any address maps, but that are allowed to persist
284 * (an attribute specified by the associated memory manager),
285 * are kept in a queue (vm_object_cached_list).
286 *
287 * When an object from this queue is referenced again,
288 * for example to make another address space mapping,
289 * it must be removed from the queue. That is, the
290 * queue contains *only* objects with zero references.
291 *
292 * The kernel may choose to terminate objects from this
293 * queue in order to reclaim storage. The current policy
294 * is to permit a fixed maximum number of unreferenced
295 * objects (vm_object_cached_max).
296 *
297 * A spin lock (accessed by routines
298 * vm_object_cache_{lock,lock_try,unlock}) governs the
299 * object cache. It must be held when objects are
300 * added to or removed from the cache (in vm_object_terminate).
301 * The routines that acquire a reference to a virtual
302 * memory object based on one of the memory object ports
303 * must also lock the cache.
304 *
305 * Ideally, the object cache should be more isolated
306 * from the reference mechanism, so that the lock need
307 * not be held to make simple references.
308 */
309 static vm_object_t vm_object_cache_trim(
310 boolean_t called_from_vm_object_deallocate);
311
312 static void vm_object_deactivate_all_pages(
313 vm_object_t object);
314
315 static int vm_object_cached_high; /* highest # cached objects */
316 static int              vm_object_cached_max = 512;     /* may be patched */
317
318 #define vm_object_cache_lock() \
319 lck_mtx_lock(&vm_object_cached_lock_data)
320 #define vm_object_cache_lock_try() \
321 lck_mtx_try_lock(&vm_object_cached_lock_data)
322
323 #endif /* VM_OBJECT_CACHE */
324
325 static queue_head_t vm_object_cached_list;
326 static uint32_t vm_object_cache_pages_freed = 0;
327 static uint32_t vm_object_cache_pages_moved = 0;
328 static uint32_t vm_object_cache_pages_skipped = 0;
329 static uint32_t vm_object_cache_adds = 0;
330 static uint32_t vm_object_cached_count = 0;
331 static lck_mtx_t vm_object_cached_lock_data;
332 static lck_mtx_ext_t vm_object_cached_lock_data_ext;
333
334 static uint32_t vm_object_page_grab_failed = 0;
335 static uint32_t vm_object_page_grab_skipped = 0;
336 static uint32_t vm_object_page_grab_returned = 0;
337 static uint32_t vm_object_page_grab_pmapped = 0;
338 static uint32_t vm_object_page_grab_reactivations = 0;
339
340 #define vm_object_cache_lock_spin() \
341 lck_mtx_lock_spin(&vm_object_cached_lock_data)
342 #define vm_object_cache_unlock() \
343 lck_mtx_unlock(&vm_object_cached_lock_data)
344
345 static void vm_object_cache_remove_locked(vm_object_t);
346
347
348 #define VM_OBJECT_HASH_COUNT 1024
349 #define VM_OBJECT_HASH_LOCK_COUNT 512
350
351 static lck_mtx_t vm_object_hashed_lock_data[VM_OBJECT_HASH_LOCK_COUNT];
352 static lck_mtx_ext_t vm_object_hashed_lock_data_ext[VM_OBJECT_HASH_LOCK_COUNT];
353
354 static queue_head_t vm_object_hashtable[VM_OBJECT_HASH_COUNT];
355 static struct zone *vm_object_hash_zone;
356
357 struct vm_object_hash_entry {
358 queue_chain_t hash_link; /* hash chain link */
359 memory_object_t pager; /* pager we represent */
360 vm_object_t object; /* corresponding object */
361 boolean_t waiting; /* someone waiting for
362 * termination */
363 };
364
365 typedef struct vm_object_hash_entry *vm_object_hash_entry_t;
366 #define VM_OBJECT_HASH_ENTRY_NULL ((vm_object_hash_entry_t) 0)
367
368 #define VM_OBJECT_HASH_SHIFT 5
369 #define vm_object_hash(pager) \
370 ((int)((((uintptr_t)pager) >> VM_OBJECT_HASH_SHIFT) % VM_OBJECT_HASH_COUNT))
371
372 #define vm_object_lock_hash(pager) \
373 ((int)((((uintptr_t)pager) >> VM_OBJECT_HASH_SHIFT) % VM_OBJECT_HASH_LOCK_COUNT))
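/*
 * Both hashes drop the low VM_OBJECT_HASH_SHIFT bits first, presumably
 * because pager pointers are allocation-aligned and those bits carry no
 * information.  For example, a pager at 0x1040 hashes to
 * ((0x1040 >> 5) % 1024) == 130 for the bucket and
 * ((0x1040 >> 5) % 512) == 130 for the lock.
 */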
374
375 void vm_object_hash_entry_free(
376 vm_object_hash_entry_t entry);
377
378 static void vm_object_reap(vm_object_t object);
379 static void vm_object_reap_async(vm_object_t object);
380 static void vm_object_reaper_thread(void);
381
382 static lck_mtx_t vm_object_reaper_lock_data;
383 static lck_mtx_ext_t vm_object_reaper_lock_data_ext;
384
385 static queue_head_t vm_object_reaper_queue; /* protected by vm_object_reaper_lock() */
386 unsigned int vm_object_reap_count = 0;
387 unsigned int vm_object_reap_count_async = 0;
388
389 #define vm_object_reaper_lock() \
390 lck_mtx_lock(&vm_object_reaper_lock_data)
391 #define vm_object_reaper_lock_spin() \
392 lck_mtx_lock_spin(&vm_object_reaper_lock_data)
393 #define vm_object_reaper_unlock() \
394 lck_mtx_unlock(&vm_object_reaper_lock_data)
395
396 #if CONFIG_IOSCHED
397 /* I/O Re-prioritization request list */
398 queue_head_t io_reprioritize_list;
399 lck_spin_t io_reprioritize_list_lock;
400
401 #define IO_REPRIORITIZE_LIST_LOCK() \
402 lck_spin_lock(&io_reprioritize_list_lock)
403 #define IO_REPRIORITIZE_LIST_UNLOCK() \
404 lck_spin_unlock(&io_reprioritize_list_lock)
405
406 #define MAX_IO_REPRIORITIZE_REQS 8192
407 zone_t io_reprioritize_req_zone;
408
409 /* I/O Re-prioritization thread */
410 int io_reprioritize_wakeup = 0;
411 static void io_reprioritize_thread(void *param __unused, wait_result_t wr __unused);
412
413 #define IO_REPRIO_THREAD_WAKEUP() thread_wakeup((event_t)&io_reprioritize_wakeup)
414 #define IO_REPRIO_THREAD_CONTINUATION() \
415 { \
416 assert_wait(&io_reprioritize_wakeup, THREAD_UNINT); \
417 thread_block(io_reprioritize_thread); \
418 }
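/*
 * The continuation macro parks the reprioritization thread on the
 * io_reprioritize_wakeup event until IO_REPRIO_THREAD_WAKEUP() posts it;
 * thread_block() resumes execution at io_reprioritize_thread() rather
 * than returning here.
 */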
419
420 void vm_page_request_reprioritize(vm_object_t, uint64_t, uint32_t, int);
421 void vm_page_handle_prio_inversion(vm_object_t, vm_page_t);
422 void vm_decmp_upl_reprioritize(upl_t, int);
423 #endif
424
425 #if 0
426 #undef KERNEL_DEBUG
427 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
428 #endif
429
430
431 static lck_mtx_t *
432 vm_object_hash_lock_spin(
433 memory_object_t pager)
434 {
435 int index;
436
437 index = vm_object_lock_hash(pager);
438
439 lck_mtx_lock_spin(&vm_object_hashed_lock_data[index]);
440
441 return (&vm_object_hashed_lock_data[index]);
442 }
443
444 static void
445 vm_object_hash_unlock(lck_mtx_t *lck)
446 {
447 lck_mtx_unlock(lck);
448 }
449
450
451 /*
452 * vm_object_hash_lookup looks up a pager in the hashtable
453 * and returns the corresponding entry, with optional removal.
454 */
455 static vm_object_hash_entry_t
456 vm_object_hash_lookup(
457 memory_object_t pager,
458 boolean_t remove_entry)
459 {
460 queue_t bucket;
461 vm_object_hash_entry_t entry;
462
463 bucket = &vm_object_hashtable[vm_object_hash(pager)];
464
465 entry = (vm_object_hash_entry_t)queue_first(bucket);
466 while (!queue_end(bucket, (queue_entry_t)entry)) {
467 if (entry->pager == pager) {
468 if (remove_entry) {
469 queue_remove(bucket, entry,
470 vm_object_hash_entry_t, hash_link);
471 }
472 return(entry);
473 }
474 entry = (vm_object_hash_entry_t)queue_next(&entry->hash_link);
475 }
476 return(VM_OBJECT_HASH_ENTRY_NULL);
477 }
478
479 /*
480  *      vm_object_hash_insert enters the specified
481 * pager / cache object association in the hashtable.
482 */
483
484 static void
485 vm_object_hash_insert(
486 vm_object_hash_entry_t entry,
487 vm_object_t object)
488 {
489 queue_t bucket;
490
491 vm_object_lock_assert_exclusive(object);
492
493 bucket = &vm_object_hashtable[vm_object_hash(entry->pager)];
494
495 queue_enter(bucket, entry, vm_object_hash_entry_t, hash_link);
496
497 entry->object = object;
498 object->hashed = TRUE;
499 }
500
501 static vm_object_hash_entry_t
502 vm_object_hash_entry_alloc(
503 memory_object_t pager)
504 {
505 vm_object_hash_entry_t entry;
506
507 entry = (vm_object_hash_entry_t)zalloc(vm_object_hash_zone);
508 entry->pager = pager;
509 entry->object = VM_OBJECT_NULL;
510 entry->waiting = FALSE;
511
512 return(entry);
513 }
514
515 void
516 vm_object_hash_entry_free(
517 vm_object_hash_entry_t entry)
518 {
519 zfree(vm_object_hash_zone, entry);
520 }
521
522 /*
523 * vm_object_allocate:
524 *
525 * Returns a new object with the given size.
526 */
527
528 __private_extern__ void
529 _vm_object_allocate(
530 vm_object_size_t size,
531 vm_object_t object)
532 {
533 XPR(XPR_VM_OBJECT,
534 "vm_object_allocate, object 0x%X size 0x%X\n",
535 object, size, 0,0,0);
536
537 *object = vm_object_template;
538 queue_init(&object->memq);
539 queue_init(&object->msr_q);
540 #if UPL_DEBUG || CONFIG_IOSCHED
541 queue_init(&object->uplq);
542 #endif
543 vm_object_lock_init(object);
544 object->vo_size = size;
545
546 #if VM_OBJECT_TRACKING_OP_CREATED
547 if (vm_object_tracking_inited) {
548 void *bt[VM_OBJECT_TRACKING_BTDEPTH];
549 int numsaved = 0;
550
551 numsaved = OSBacktrace(bt, VM_OBJECT_TRACKING_BTDEPTH);
552 btlog_add_entry(vm_object_tracking_btlog,
553 object,
554 VM_OBJECT_TRACKING_OP_CREATED,
555 bt,
556 numsaved);
557 }
558 #endif /* VM_OBJECT_TRACKING_OP_CREATED */
559 }
560
561 __private_extern__ vm_object_t
562 vm_object_allocate(
563 vm_object_size_t size)
564 {
565 register vm_object_t object;
566
567 object = (vm_object_t) zalloc(vm_object_zone);
568
569 // dbgLog(object, size, 0, 2); /* (TEST/DEBUG) */
570
571 if (object != VM_OBJECT_NULL)
572 _vm_object_allocate(size, object);
573
574 return object;
575 }
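/*
 * Typical usage, sketched with error handling elided: the caller owns
 * the single reference created here and drops it with
 * vm_object_deallocate().
 *
 *	vm_object_t obj = vm_object_allocate(PAGE_SIZE);
 *	if (obj != VM_OBJECT_NULL) {
 *		... map or populate the object ...
 *		vm_object_deallocate(obj);
 *	}
 *
 * A VM_OBJECT_NULL return means the zone allocation failed.
 */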
576
577
578 lck_grp_t vm_object_lck_grp;
579 lck_grp_t vm_object_cache_lck_grp;
580 lck_grp_attr_t vm_object_lck_grp_attr;
581 lck_attr_t vm_object_lck_attr;
582 lck_attr_t kernel_object_lck_attr;
583 lck_attr_t compressor_object_lck_attr;
584
585 /*
586 * vm_object_bootstrap:
587 *
588 * Initialize the VM objects module.
589 */
590 __private_extern__ void
591 vm_object_bootstrap(void)
592 {
593 register int i;
594
595 vm_object_zone = zinit((vm_size_t) sizeof(struct vm_object),
596 round_page(512*1024),
597 round_page(12*1024),
598 "vm objects");
599 zone_change(vm_object_zone, Z_CALLERACCT, FALSE); /* don't charge caller */
600 zone_change(vm_object_zone, Z_NOENCRYPT, TRUE);
601
602 vm_object_init_lck_grp();
603
604 queue_init(&vm_object_cached_list);
605
606 lck_mtx_init_ext(&vm_object_cached_lock_data,
607 &vm_object_cached_lock_data_ext,
608 &vm_object_cache_lck_grp,
609 &vm_object_lck_attr);
610
611 queue_init(&vm_object_reaper_queue);
612
613 for (i = 0; i < VM_OBJECT_HASH_LOCK_COUNT; i++) {
614 lck_mtx_init_ext(&vm_object_hashed_lock_data[i],
615 &vm_object_hashed_lock_data_ext[i],
616 &vm_object_lck_grp,
617 &vm_object_lck_attr);
618 }
619 lck_mtx_init_ext(&vm_object_reaper_lock_data,
620 &vm_object_reaper_lock_data_ext,
621 &vm_object_lck_grp,
622 &vm_object_lck_attr);
623
624 vm_object_hash_zone =
625 zinit((vm_size_t) sizeof (struct vm_object_hash_entry),
626 round_page(512*1024),
627 round_page(12*1024),
628 "vm object hash entries");
629 zone_change(vm_object_hash_zone, Z_CALLERACCT, FALSE);
630 zone_change(vm_object_hash_zone, Z_NOENCRYPT, TRUE);
631
632 for (i = 0; i < VM_OBJECT_HASH_COUNT; i++)
633 queue_init(&vm_object_hashtable[i]);
634
635
636 /*
637 * Fill in a template object, for quick initialization
638 */
639
640 /* memq; Lock; init after allocation */
641 vm_object_template.memq.prev = NULL;
642 vm_object_template.memq.next = NULL;
643 #if 0
644 /*
645 * We can't call vm_object_lock_init() here because that will
646 * allocate some memory and VM is not fully initialized yet.
647 * The lock will be initialized for each allocated object in
648 * _vm_object_allocate(), so we don't need to initialize it in
649 * the vm_object_template.
650 */
651 vm_object_lock_init(&vm_object_template);
652 #endif
653 vm_object_template.vo_size = 0;
654 vm_object_template.memq_hint = VM_PAGE_NULL;
655 vm_object_template.ref_count = 1;
656 #if TASK_SWAPPER
657 vm_object_template.res_count = 1;
658 #endif /* TASK_SWAPPER */
659 vm_object_template.resident_page_count = 0;
660 vm_object_template.wired_page_count = 0;
661 vm_object_template.reusable_page_count = 0;
662 vm_object_template.copy = VM_OBJECT_NULL;
663 vm_object_template.shadow = VM_OBJECT_NULL;
664 vm_object_template.vo_shadow_offset = (vm_object_offset_t) 0;
665 vm_object_template.pager = MEMORY_OBJECT_NULL;
666 vm_object_template.paging_offset = 0;
667 vm_object_template.pager_control = MEMORY_OBJECT_CONTROL_NULL;
668 vm_object_template.copy_strategy = MEMORY_OBJECT_COPY_SYMMETRIC;
669 vm_object_template.paging_in_progress = 0;
670 #if __LP64__
671 vm_object_template.__object1_unused_bits = 0;
672 #endif /* __LP64__ */
673 vm_object_template.activity_in_progress = 0;
674
675 /* Begin bitfields */
676 vm_object_template.all_wanted = 0; /* all bits FALSE */
677 vm_object_template.pager_created = FALSE;
678 vm_object_template.pager_initialized = FALSE;
679 vm_object_template.pager_ready = FALSE;
680 vm_object_template.pager_trusted = FALSE;
681 vm_object_template.can_persist = FALSE;
682 vm_object_template.internal = TRUE;
683 vm_object_template.temporary = TRUE;
684 vm_object_template.private = FALSE;
685 vm_object_template.pageout = FALSE;
686 vm_object_template.alive = TRUE;
687 vm_object_template.purgable = VM_PURGABLE_DENY;
688 vm_object_template.purgeable_when_ripe = FALSE;
689 vm_object_template.shadowed = FALSE;
690 vm_object_template.advisory_pageout = FALSE;
691 vm_object_template.true_share = FALSE;
692 vm_object_template.terminating = FALSE;
693 vm_object_template.named = FALSE;
694 vm_object_template.shadow_severed = FALSE;
695 vm_object_template.phys_contiguous = FALSE;
696 vm_object_template.nophyscache = FALSE;
697 /* End bitfields */
698
699 vm_object_template.cached_list.prev = NULL;
700 vm_object_template.cached_list.next = NULL;
701 vm_object_template.msr_q.prev = NULL;
702 vm_object_template.msr_q.next = NULL;
703
704 vm_object_template.last_alloc = (vm_object_offset_t) 0;
705 vm_object_template.sequential = (vm_object_offset_t) 0;
706 vm_object_template.pages_created = 0;
707 vm_object_template.pages_used = 0;
708 vm_object_template.scan_collisions = 0;
709 #if CONFIG_PHANTOM_CACHE
710 vm_object_template.phantom_object_id = 0;
711 #endif
712 #if MACH_PAGEMAP
713 vm_object_template.existence_map = VM_EXTERNAL_NULL;
714 #endif /* MACH_PAGEMAP */
715 vm_object_template.cow_hint = ~(vm_offset_t)0;
716 #if MACH_ASSERT
717 vm_object_template.paging_object = VM_OBJECT_NULL;
718 #endif /* MACH_ASSERT */
719
720 /* cache bitfields */
721 vm_object_template.wimg_bits = VM_WIMG_USE_DEFAULT;
722 vm_object_template.set_cache_attr = FALSE;
723 vm_object_template.object_slid = FALSE;
724 vm_object_template.code_signed = FALSE;
725 vm_object_template.hashed = FALSE;
726 vm_object_template.transposed = FALSE;
727 vm_object_template.mapping_in_progress = FALSE;
728 vm_object_template.phantom_isssd = FALSE;
729 vm_object_template.volatile_empty = FALSE;
730 vm_object_template.volatile_fault = FALSE;
731 vm_object_template.all_reusable = FALSE;
732 vm_object_template.blocked_access = FALSE;
733 vm_object_template.__object2_unused_bits = 0;
734 #if CONFIG_IOSCHED || UPL_DEBUG
735 vm_object_template.uplq.prev = NULL;
736 vm_object_template.uplq.next = NULL;
737 #endif /* CONFIG_IOSCHED || UPL_DEBUG */
738 #ifdef VM_PIP_DEBUG
739 bzero(&vm_object_template.pip_holders,
740 sizeof (vm_object_template.pip_holders));
741 #endif /* VM_PIP_DEBUG */
742
743 vm_object_template.objq.next = NULL;
744 vm_object_template.objq.prev = NULL;
745
746 vm_object_template.purgeable_queue_type = PURGEABLE_Q_TYPE_MAX;
747 vm_object_template.purgeable_queue_group = 0;
748
749 vm_object_template.vo_cache_ts = 0;
750
751 #if DEBUG
752 bzero(&vm_object_template.purgeable_owner_bt[0],
753 sizeof (vm_object_template.purgeable_owner_bt));
754 vm_object_template.vo_purgeable_volatilizer = NULL;
755 bzero(&vm_object_template.purgeable_volatilizer_bt[0],
756 sizeof (vm_object_template.purgeable_volatilizer_bt));
757 #endif /* DEBUG */
758
759 /*
760 * Initialize the "kernel object"
761 */
762
763 kernel_object = &kernel_object_store;
764
765 /*
766 * Note that in the following size specifications, we need to add 1 because
767 * VM_MAX_KERNEL_ADDRESS (vm_last_addr) is a maximum address, not a size.
768 */
769
770 #ifdef ppc
771 _vm_object_allocate(vm_last_addr + 1,
772 kernel_object);
773 #else
774 _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1,
775 kernel_object);
776
777 _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1,
778 compressor_object);
779 #endif
780 kernel_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
781 compressor_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
782
783 /*
784 * Initialize the "submap object". Make it as large as the
785 * kernel object so that no limit is imposed on submap sizes.
786 */
787
788 vm_submap_object = &vm_submap_object_store;
789 #ifdef ppc
790 _vm_object_allocate(vm_last_addr + 1,
791 vm_submap_object);
792 #else
793 _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1,
794 vm_submap_object);
795 #endif
796 vm_submap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
797
798 /*
799 * Create an "extra" reference to this object so that we never
800 * try to deallocate it; zfree doesn't like to be called with
801 * non-zone memory.
802 */
803 vm_object_reference(vm_submap_object);
804
805 #if MACH_PAGEMAP
806 vm_external_module_initialize();
807 #endif /* MACH_PAGEMAP */
808 }
809
810 #if CONFIG_IOSCHED
811 void
812 vm_io_reprioritize_init(void)
813 {
814 kern_return_t result;
815 thread_t thread = THREAD_NULL;
816
817         /* Initialize the I/O reprioritization subsystem */
818 lck_spin_init(&io_reprioritize_list_lock, &vm_object_lck_grp, &vm_object_lck_attr);
819 queue_init(&io_reprioritize_list);
820
821 io_reprioritize_req_zone = zinit(sizeof(struct io_reprioritize_req),
822 MAX_IO_REPRIORITIZE_REQS * sizeof(struct io_reprioritize_req),
823 4096, "io_reprioritize_req");
824
825 result = kernel_thread_start_priority(io_reprioritize_thread, NULL, 95 /* MAXPRI_KERNEL */, &thread);
826 if (result == KERN_SUCCESS) {
827 thread_deallocate(thread);
828 } else {
829 panic("Could not create io_reprioritize_thread");
830 }
831 }
832 #endif
833
834 void
835 vm_object_reaper_init(void)
836 {
837 kern_return_t kr;
838 thread_t thread;
839
840 kr = kernel_thread_start_priority(
841 (thread_continue_t) vm_object_reaper_thread,
842 NULL,
843 BASEPRI_PREEMPT - 1,
844 &thread);
845 if (kr != KERN_SUCCESS) {
846 panic("failed to launch vm_object_reaper_thread kr=0x%x", kr);
847 }
848 thread_deallocate(thread);
849 }
850
851 __private_extern__ void
852 vm_object_init(void)
853 {
854 /*
855 * Finish initializing the kernel object.
856 */
857 }
858
859
860 __private_extern__ void
861 vm_object_init_lck_grp(void)
862 {
863 /*
864          * initialize the vm_object lock world
865 */
866 lck_grp_attr_setdefault(&vm_object_lck_grp_attr);
867 lck_grp_init(&vm_object_lck_grp, "vm_object", &vm_object_lck_grp_attr);
868 lck_grp_init(&vm_object_cache_lck_grp, "vm_object_cache", &vm_object_lck_grp_attr);
869 lck_attr_setdefault(&vm_object_lck_attr);
870 lck_attr_setdefault(&kernel_object_lck_attr);
871 lck_attr_cleardebug(&kernel_object_lck_attr);
872 lck_attr_setdefault(&compressor_object_lck_attr);
873 lck_attr_cleardebug(&compressor_object_lck_attr);
874 }
875
876 #if VM_OBJECT_CACHE
877 #define MIGHT_NOT_CACHE_SHADOWS 1
878 #if MIGHT_NOT_CACHE_SHADOWS
879 static int cache_shadows = TRUE;
880 #endif /* MIGHT_NOT_CACHE_SHADOWS */
881 #endif
882
883 /*
884 * vm_object_deallocate:
885 *
886 * Release a reference to the specified object,
887 * gained either through a vm_object_allocate
888 * or a vm_object_reference call. When all references
889 * are gone, storage associated with this object
890 * may be relinquished.
891 *
892 * No object may be locked.
893 */
894 unsigned long vm_object_deallocate_shared_successes = 0;
895 unsigned long vm_object_deallocate_shared_failures = 0;
896 unsigned long vm_object_deallocate_shared_swap_failures = 0;
897 __private_extern__ void
898 vm_object_deallocate(
899 register vm_object_t object)
900 {
901 #if VM_OBJECT_CACHE
902 boolean_t retry_cache_trim = FALSE;
903 uint32_t try_failed_count = 0;
904 #endif
905 vm_object_t shadow = VM_OBJECT_NULL;
906
907 // if(object)dbgLog(object, object->ref_count, object->can_persist, 3); /* (TEST/DEBUG) */
908 // else dbgLog(object, 0, 0, 3); /* (TEST/DEBUG) */
909
910 if (object == VM_OBJECT_NULL)
911 return;
912
913 if (object == kernel_object || object == compressor_object) {
914 vm_object_lock_shared(object);
915
916 OSAddAtomic(-1, &object->ref_count);
917
918 if (object->ref_count == 0) {
919 if (object == kernel_object)
920 panic("vm_object_deallocate: losing kernel_object\n");
921 else
922 panic("vm_object_deallocate: losing compressor_object\n");
923 }
924 vm_object_unlock(object);
925 return;
926 }
927
928 if (object->ref_count == 2 &&
929 object->named) {
930 /*
931 * This "named" object's reference count is about to
932 * drop from 2 to 1:
933 * we'll need to call memory_object_last_unmap().
934 */
935 } else if (object->ref_count == 2 &&
936 object->internal &&
937 object->shadow != VM_OBJECT_NULL) {
938 /*
939 * This internal object's reference count is about to
940 * drop from 2 to 1 and it has a shadow object:
941 * we'll want to try and collapse this object with its
942 * shadow.
943 */
944 } else if (object->ref_count >= 2) {
945 UInt32 original_ref_count;
946 volatile UInt32 *ref_count_p;
947 Boolean atomic_swap;
948
949 /*
950 * The object currently looks like it is not being
951 * kept alive solely by the reference we're about to release.
952 * Let's try and release our reference without taking
953 * all the locks we would need if we had to terminate the
954 * object (cache lock + exclusive object lock).
955 * Lock the object "shared" to make sure we don't race with
956 * anyone holding it "exclusive".
957 */
958 vm_object_lock_shared(object);
959 ref_count_p = (volatile UInt32 *) &object->ref_count;
960 original_ref_count = object->ref_count;
961 /*
962 * Test again as "ref_count" could have changed.
963 * "named" shouldn't change.
964 */
965 if (original_ref_count == 2 &&
966 object->named) {
967 /* need to take slow path for m_o_last_unmap() */
968 atomic_swap = FALSE;
969 } else if (original_ref_count == 2 &&
970 object->internal &&
971 object->shadow != VM_OBJECT_NULL) {
972 /* need to take slow path for vm_object_collapse() */
973 atomic_swap = FALSE;
974 } else if (original_ref_count < 2) {
975 /* need to take slow path for vm_object_terminate() */
976 atomic_swap = FALSE;
977 } else {
978 /* try an atomic update with the shared lock */
979 atomic_swap = OSCompareAndSwap(
980 original_ref_count,
981 original_ref_count - 1,
982 (UInt32 *) &object->ref_count);
983 if (atomic_swap == FALSE) {
984 vm_object_deallocate_shared_swap_failures++;
985 /* fall back to the slow path... */
986 }
987 }
988
989 vm_object_unlock(object);
990
991 if (atomic_swap) {
992 /*
993 * ref_count was updated atomically !
994 */
995 vm_object_deallocate_shared_successes++;
996 return;
997 }
998
999 /*
1000 * Someone else updated the ref_count at the same
1001 * time and we lost the race. Fall back to the usual
1002 * slow but safe path...
1003 */
1004 vm_object_deallocate_shared_failures++;
1005 }
1006
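	/*
	 * Slow path: take the object lock exclusively and drop one
	 * reference per iteration.  If other references remain we are
	 * done; on the last reference the object is either cached
	 * (VM_OBJECT_CACHE) or terminated, and termination can hand back
	 * a shadow object whose reference must be dropped in turn, so the
	 * loop continues with object = shadow.
	 */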
1007 while (object != VM_OBJECT_NULL) {
1008
1009 vm_object_lock(object);
1010
1011 assert(object->ref_count > 0);
1012
1013 /*
1014 * If the object has a named reference, and only
1015 * that reference would remain, inform the pager
1016 * about the last "mapping" reference going away.
1017 */
1018 if ((object->ref_count == 2) && (object->named)) {
1019 memory_object_t pager = object->pager;
1020
1021 /* Notify the Pager that there are no */
1022 /* more mappers for this object */
1023
1024 if (pager != MEMORY_OBJECT_NULL) {
1025 vm_object_mapping_wait(object, THREAD_UNINT);
1026 vm_object_mapping_begin(object);
1027 vm_object_unlock(object);
1028
1029 memory_object_last_unmap(pager);
1030
1031 vm_object_lock(object);
1032 vm_object_mapping_end(object);
1033 }
1034 assert(object->ref_count > 0);
1035 }
1036
1037 /*
1038 * Lose the reference. If other references
1039 * remain, then we are done, unless we need
1040 * to retry a cache trim.
1041 * If it is the last reference, then keep it
1042 * until any pending initialization is completed.
1043 */
1044
1045 /* if the object is terminating, it cannot go into */
1046 /* the cache and we obviously should not call */
1047 /* terminate again. */
1048
1049 if ((object->ref_count > 1) || object->terminating) {
1050 vm_object_lock_assert_exclusive(object);
1051 object->ref_count--;
1052 vm_object_res_deallocate(object);
1053
1054 if (object->ref_count == 1 &&
1055 object->shadow != VM_OBJECT_NULL) {
1056 /*
1057 * There's only one reference left on this
1058 * VM object. We can't tell if it's a valid
1059 * one (from a mapping for example) or if this
1060 * object is just part of a possibly stale and
1061 * useless shadow chain.
1062 * We would like to try and collapse it into
1063 * its parent, but we don't have any pointers
1064 * back to this parent object.
1065 * But we can try and collapse this object with
1066 * its own shadows, in case these are useless
1067 * too...
1068 * We can't bypass this object though, since we
1069 * don't know if this last reference on it is
1070 * meaningful or not.
1071 */
1072 vm_object_collapse(object, 0, FALSE);
1073 }
1074 vm_object_unlock(object);
1075 #if VM_OBJECT_CACHE
1076 if (retry_cache_trim &&
1077 ((object = vm_object_cache_trim(TRUE)) !=
1078 VM_OBJECT_NULL)) {
1079 continue;
1080 }
1081 #endif
1082 return;
1083 }
1084
1085 /*
1086 * We have to wait for initialization
1087 * before destroying or caching the object.
1088 */
1089
1090 if (object->pager_created && ! object->pager_initialized) {
1091 assert(! object->can_persist);
1092 vm_object_assert_wait(object,
1093 VM_OBJECT_EVENT_INITIALIZED,
1094 THREAD_UNINT);
1095 vm_object_unlock(object);
1096
1097 thread_block(THREAD_CONTINUE_NULL);
1098 continue;
1099 }
1100
1101 #if VM_OBJECT_CACHE
1102 /*
1103 * If this object can persist, then enter it in
1104 * the cache. Otherwise, terminate it.
1105 *
1106 * NOTE: Only permanent objects are cached, and
1107 * permanent objects cannot have shadows. This
1108 * affects the residence counting logic in a minor
1109 * way (can do it in-line, mostly).
1110 */
1111
1112 if ((object->can_persist) && (object->alive)) {
1113 /*
1114 * Now it is safe to decrement reference count,
1115 * and to return if reference count is > 0.
1116 */
1117
1118 vm_object_lock_assert_exclusive(object);
1119 if (--object->ref_count > 0) {
1120 vm_object_res_deallocate(object);
1121 vm_object_unlock(object);
1122
1123 if (retry_cache_trim &&
1124 ((object = vm_object_cache_trim(TRUE)) !=
1125 VM_OBJECT_NULL)) {
1126 continue;
1127 }
1128 return;
1129 }
1130
1131 #if MIGHT_NOT_CACHE_SHADOWS
1132 /*
1133 * Remove shadow now if we don't
1134 * want to cache shadows.
1135 */
1136 if (! cache_shadows) {
1137 shadow = object->shadow;
1138 object->shadow = VM_OBJECT_NULL;
1139 }
1140 #endif /* MIGHT_NOT_CACHE_SHADOWS */
1141
1142 /*
1143 * Enter the object onto the queue of
1144 * cached objects, and deactivate
1145 * all of its pages.
1146 */
1147 assert(object->shadow == VM_OBJECT_NULL);
1148 VM_OBJ_RES_DECR(object);
1149 XPR(XPR_VM_OBJECT,
1150 "vm_o_deallocate: adding %x to cache, queue = (%x, %x)\n",
1151 object,
1152 vm_object_cached_list.next,
1153 vm_object_cached_list.prev,0,0);
1154
1155
1156 vm_object_unlock(object);
1157
1158 try_failed_count = 0;
1159 for (;;) {
1160 vm_object_cache_lock();
1161
1162 /*
1163 * if we try to take a regular lock here
1164 * we risk deadlocking against someone
1165 * holding a lock on this object while
1166 * trying to vm_object_deallocate a different
1167 * object
1168 */
1169 if (vm_object_lock_try(object))
1170 break;
1171 vm_object_cache_unlock();
1172 try_failed_count++;
1173
1174 mutex_pause(try_failed_count); /* wait a bit */
1175 }
1176 vm_object_cached_count++;
1177 if (vm_object_cached_count > vm_object_cached_high)
1178 vm_object_cached_high = vm_object_cached_count;
1179 queue_enter(&vm_object_cached_list, object,
1180 vm_object_t, cached_list);
1181 vm_object_cache_unlock();
1182
1183 vm_object_deactivate_all_pages(object);
1184 vm_object_unlock(object);
1185
1186 #if MIGHT_NOT_CACHE_SHADOWS
1187 /*
1188 * If we have a shadow that we need
1189 * to deallocate, do so now, remembering
1190 * to trim the cache later.
1191 */
1192 if (! cache_shadows && shadow != VM_OBJECT_NULL) {
1193 object = shadow;
1194 retry_cache_trim = TRUE;
1195 continue;
1196 }
1197 #endif /* MIGHT_NOT_CACHE_SHADOWS */
1198
1199 /*
1200 * Trim the cache. If the cache trim
1201 * returns with a shadow for us to deallocate,
1202 * then remember to retry the cache trim
1203 * when we are done deallocating the shadow.
1204 * Otherwise, we are done.
1205 */
1206
1207 object = vm_object_cache_trim(TRUE);
1208 if (object == VM_OBJECT_NULL) {
1209 return;
1210 }
1211 retry_cache_trim = TRUE;
1212 } else
1213 #endif /* VM_OBJECT_CACHE */
1214 {
1215 /*
1216                          * This object is not cacheable; terminate it.
1217 */
1218 XPR(XPR_VM_OBJECT,
1219 "vm_o_deallocate: !cacheable 0x%X res %d paging_ops %d thread 0x%p ref %d\n",
1220 object, object->resident_page_count,
1221 object->paging_in_progress,
1222 (void *)current_thread(),object->ref_count);
1223
1224 VM_OBJ_RES_DECR(object); /* XXX ? */
1225 /*
1226 * Terminate this object. If it had a shadow,
1227 * then deallocate it; otherwise, if we need
1228 * to retry a cache trim, do so now; otherwise,
1229 * we are done. "pageout" objects have a shadow,
1230 * but maintain a "paging reference" rather than
1231 * a normal reference.
1232 */
1233 shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
1234
1235 if (vm_object_terminate(object) != KERN_SUCCESS) {
1236 return;
1237 }
1238 if (shadow != VM_OBJECT_NULL) {
1239 object = shadow;
1240 continue;
1241 }
1242 #if VM_OBJECT_CACHE
1243 if (retry_cache_trim &&
1244 ((object = vm_object_cache_trim(TRUE)) !=
1245 VM_OBJECT_NULL)) {
1246 continue;
1247 }
1248 #endif
1249 return;
1250 }
1251 }
1252 #if VM_OBJECT_CACHE
1253 assert(! retry_cache_trim);
1254 #endif
1255 }
1256
1257
1258
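/*
 * vm_object_page_grab:
 *
 * Try to reclaim one clean, reusable page from the object, examining a
 * small, bounded number of resident pages (at most 50).  Pages that are
 * wired, busy, cleaning, in the laundry, fictitious, or that remain
 * dirty or precious after consulting the pmap are rotated to the end of
 * the memq and skipped.  Returns the reclaimed page, or NULL if none
 * was found.  The caller must hold the object lock exclusively.
 */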
1259 vm_page_t
1260 vm_object_page_grab(
1261 vm_object_t object)
1262 {
1263 vm_page_t p, next_p;
1264 int p_limit = 0;
1265 int p_skipped = 0;
1266
1267 vm_object_lock_assert_exclusive(object);
1268
1269 next_p = (vm_page_t)queue_first(&object->memq);
1270 p_limit = MIN(50, object->resident_page_count);
1271
1272 while (!queue_end(&object->memq, (queue_entry_t)next_p) && --p_limit > 0) {
1273
1274 p = next_p;
1275 next_p = (vm_page_t)queue_next(&next_p->listq);
1276
1277 if (VM_PAGE_WIRED(p) || p->busy || p->cleaning || p->laundry || p->fictitious)
1278 goto move_page_in_obj;
1279
1280 if (p->pmapped || p->dirty || p->precious) {
1281 vm_page_lockspin_queues();
1282
1283 if (p->pmapped) {
1284 int refmod_state;
1285
1286 vm_object_page_grab_pmapped++;
1287
1288 if (p->reference == FALSE || p->dirty == FALSE) {
1289
1290 refmod_state = pmap_get_refmod(p->phys_page);
1291
1292 if (refmod_state & VM_MEM_REFERENCED)
1293 p->reference = TRUE;
1294 if (refmod_state & VM_MEM_MODIFIED) {
1295 SET_PAGE_DIRTY(p, FALSE);
1296 }
1297 }
1298 if (p->dirty == FALSE && p->precious == FALSE) {
1299
1300 refmod_state = pmap_disconnect(p->phys_page);
1301
1302 if (refmod_state & VM_MEM_REFERENCED)
1303 p->reference = TRUE;
1304 if (refmod_state & VM_MEM_MODIFIED) {
1305 SET_PAGE_DIRTY(p, FALSE);
1306 }
1307
1308 if (p->dirty == FALSE)
1309 goto take_page;
1310 }
1311 }
1312 if (p->inactive && p->reference == TRUE) {
1313 vm_page_activate(p);
1314
1315 VM_STAT_INCR(reactivations);
1316 vm_object_page_grab_reactivations++;
1317 }
1318 vm_page_unlock_queues();
1319 move_page_in_obj:
1320 queue_remove(&object->memq, p, vm_page_t, listq);
1321 queue_enter(&object->memq, p, vm_page_t, listq);
1322
1323 p_skipped++;
1324 continue;
1325 }
1326 vm_page_lockspin_queues();
1327 take_page:
1328 vm_page_free_prepare_queues(p);
1329 vm_object_page_grab_returned++;
1330 vm_object_page_grab_skipped += p_skipped;
1331
1332 vm_page_unlock_queues();
1333
1334 vm_page_free_prepare_object(p, TRUE);
1335
1336 return (p);
1337 }
1338 vm_object_page_grab_skipped += p_skipped;
1339 vm_object_page_grab_failed++;
1340
1341 return (NULL);
1342 }
1343
1344
1345
1346 #define EVICT_PREPARE_LIMIT 64
1347 #define EVICT_AGE 10
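/*
 * Objects added to the cache are stamped with vo_cache_ts = now +
 * EVICT_AGE seconds (see vm_object_cache_add()); vm_object_cache_evict()
 * will not harvest pages from an object until that timestamp has passed.
 */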
1348
1349 static clock_sec_t vm_object_cache_aging_ts = 0;
1350
1351 static void
1352 vm_object_cache_remove_locked(
1353 vm_object_t object)
1354 {
1355 queue_remove(&vm_object_cached_list, object, vm_object_t, objq);
1356 object->objq.next = NULL;
1357 object->objq.prev = NULL;
1358
1359 vm_object_cached_count--;
1360 }
1361
1362 void
1363 vm_object_cache_remove(
1364 vm_object_t object)
1365 {
1366 vm_object_cache_lock_spin();
1367
1368 if (object->objq.next || object->objq.prev)
1369 vm_object_cache_remove_locked(object);
1370
1371 vm_object_cache_unlock();
1372 }
1373
1374 void
1375 vm_object_cache_add(
1376 vm_object_t object)
1377 {
1378 clock_sec_t sec;
1379 clock_nsec_t nsec;
1380
1381 if (object->resident_page_count == 0)
1382 return;
1383 clock_get_system_nanotime(&sec, &nsec);
1384
1385 vm_object_cache_lock_spin();
1386
1387 if (object->objq.next == NULL && object->objq.prev == NULL) {
1388 queue_enter(&vm_object_cached_list, object, vm_object_t, objq);
1389 object->vo_cache_ts = sec + EVICT_AGE;
1390 object->vo_cache_pages_to_scan = object->resident_page_count;
1391
1392 vm_object_cached_count++;
1393 vm_object_cache_adds++;
1394 }
1395 vm_object_cache_unlock();
1396 }
1397
1398 int
1399 vm_object_cache_evict(
1400 int num_to_evict,
1401 int max_objects_to_examine)
1402 {
1403 vm_object_t object = VM_OBJECT_NULL;
1404 vm_object_t next_obj = VM_OBJECT_NULL;
1405 vm_page_t local_free_q = VM_PAGE_NULL;
1406 vm_page_t p;
1407 vm_page_t next_p;
1408 int object_cnt = 0;
1409 vm_page_t ep_array[EVICT_PREPARE_LIMIT];
1410 int ep_count;
1411 int ep_limit;
1412 int ep_index;
1413 int ep_freed = 0;
1414 int ep_moved = 0;
1415 uint32_t ep_skipped = 0;
1416 clock_sec_t sec;
1417 clock_nsec_t nsec;
1418
1419 KERNEL_DEBUG(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0);
1420 /*
1421 * do a couple of quick checks to see if it's
1422 * worthwhile grabbing the lock
1423 */
1424 if (queue_empty(&vm_object_cached_list)) {
1425 KERNEL_DEBUG(0x13001ec | DBG_FUNC_END, 0, 0, 0, 0, 0);
1426 return (0);
1427 }
1428 clock_get_system_nanotime(&sec, &nsec);
1429
1430 /*
1431 * the object on the head of the queue has not
1432 * yet sufficiently aged
1433 */
1434 if (sec < vm_object_cache_aging_ts) {
1435 KERNEL_DEBUG(0x13001ec | DBG_FUNC_END, 0, 0, 0, 0, 0);
1436 return (0);
1437 }
1438 /*
1439 * don't need the queue lock to find
1440 * and lock an object on the cached list
1441 */
1442 vm_page_unlock_queues();
1443
1444 vm_object_cache_lock_spin();
1445
1446 for (;;) {
1447 next_obj = (vm_object_t)queue_first(&vm_object_cached_list);
1448
1449 while (!queue_end(&vm_object_cached_list, (queue_entry_t)next_obj) && object_cnt++ < max_objects_to_examine) {
1450
1451 object = next_obj;
1452 next_obj = (vm_object_t)queue_next(&next_obj->objq);
1453
1454 if (sec < object->vo_cache_ts) {
1455 KERNEL_DEBUG(0x130020c, object, object->resident_page_count, object->vo_cache_ts, sec, 0);
1456
1457 vm_object_cache_aging_ts = object->vo_cache_ts;
1458 object = VM_OBJECT_NULL;
1459 break;
1460 }
1461 if (!vm_object_lock_try_scan(object)) {
1462 /*
1463 * just skip over this guy for now... if we find
1464                          * an object to steal pages from, we'll revisit in a bit...
1465 * hopefully, the lock will have cleared
1466 */
1467 KERNEL_DEBUG(0x13001f8, object, object->resident_page_count, 0, 0, 0);
1468
1469 object = VM_OBJECT_NULL;
1470 continue;
1471 }
1472 if (queue_empty(&object->memq) || object->vo_cache_pages_to_scan == 0) {
1473 /*
1474 * this case really shouldn't happen, but it's not fatal
1475 * so deal with it... if we don't remove the object from
1476 * the list, we'll never move past it.
1477 */
1478 KERNEL_DEBUG(0x13001fc, object, object->resident_page_count, ep_freed, ep_moved, 0);
1479
1480 vm_object_cache_remove_locked(object);
1481 vm_object_unlock(object);
1482 object = VM_OBJECT_NULL;
1483 continue;
1484 }
1485 /*
1486 * we have a locked object with pages...
1487 * time to start harvesting
1488 */
1489 break;
1490 }
1491 vm_object_cache_unlock();
1492
1493 if (object == VM_OBJECT_NULL)
1494 break;
1495
1496 /*
1497 * object is locked at this point and
1498 * has resident pages
1499 */
1500 next_p = (vm_page_t)queue_first(&object->memq);
1501
1502 /*
1503 * break the page scan into 2 pieces to minimize the time spent
1504 * behind the page queue lock...
1505 * the list of pages on these unused objects is likely to be cold
1506 * w/r to the cpu cache which increases the time to scan the list
1507 * tenfold... and we may have a 'run' of pages we can't utilize that
1508 * needs to be skipped over...
1509 */
1510 if ((ep_limit = num_to_evict - (ep_freed + ep_moved)) > EVICT_PREPARE_LIMIT)
1511 ep_limit = EVICT_PREPARE_LIMIT;
1512 ep_count = 0;
1513
1514 while (!queue_end(&object->memq, (queue_entry_t)next_p) && object->vo_cache_pages_to_scan && ep_count < ep_limit) {
1515
1516 p = next_p;
1517 next_p = (vm_page_t)queue_next(&next_p->listq);
1518
1519 object->vo_cache_pages_to_scan--;
1520
1521 if (VM_PAGE_WIRED(p) || p->busy || p->cleaning || p->laundry) {
1522 queue_remove(&object->memq, p, vm_page_t, listq);
1523 queue_enter(&object->memq, p, vm_page_t, listq);
1524
1525 ep_skipped++;
1526 continue;
1527 }
1528 if (p->wpmapped || p->dirty || p->precious) {
1529 queue_remove(&object->memq, p, vm_page_t, listq);
1530 queue_enter(&object->memq, p, vm_page_t, listq);
1531
1532 pmap_clear_reference(p->phys_page);
1533 }
1534 ep_array[ep_count++] = p;
1535 }
1536 KERNEL_DEBUG(0x13001f4 | DBG_FUNC_START, object, object->resident_page_count, ep_freed, ep_moved, 0);
1537
1538 vm_page_lockspin_queues();
1539
1540 for (ep_index = 0; ep_index < ep_count; ep_index++) {
1541
1542 p = ep_array[ep_index];
1543
1544 if (p->wpmapped || p->dirty || p->precious) {
1545 p->reference = FALSE;
1546 p->no_cache = FALSE;
1547
1548 /*
1549 * we've already filtered out pages that are in the laundry
1550 * so if we get here, this page can't be on the pageout queue
1551 */
1552 assert(!p->pageout_queue);
1553
1554 VM_PAGE_QUEUES_REMOVE(p);
1555 VM_PAGE_ENQUEUE_INACTIVE(p, TRUE);
1556
1557 ep_moved++;
1558 } else {
1559 #if CONFIG_PHANTOM_CACHE
1560 vm_phantom_cache_add_ghost(p);
1561 #endif
1562 vm_page_free_prepare_queues(p);
1563
1564 assert(p->pageq.next == NULL && p->pageq.prev == NULL);
1565 /*
1566 * Add this page to our list of reclaimed pages,
1567 * to be freed later.
1568 */
1569 p->pageq.next = (queue_entry_t) local_free_q;
1570 local_free_q = p;
1571
1572 ep_freed++;
1573 }
1574 }
1575 vm_page_unlock_queues();
1576
1577 KERNEL_DEBUG(0x13001f4 | DBG_FUNC_END, object, object->resident_page_count, ep_freed, ep_moved, 0);
1578
1579 if (local_free_q) {
1580 vm_page_free_list(local_free_q, TRUE);
1581 local_free_q = VM_PAGE_NULL;
1582 }
1583 if (object->vo_cache_pages_to_scan == 0) {
1584 KERNEL_DEBUG(0x1300208, object, object->resident_page_count, ep_freed, ep_moved, 0);
1585
1586 vm_object_cache_remove(object);
1587
1588 KERNEL_DEBUG(0x13001fc, object, object->resident_page_count, ep_freed, ep_moved, 0);
1589 }
1590 /*
1591 * done with this object
1592 */
1593 vm_object_unlock(object);
1594 object = VM_OBJECT_NULL;
1595
1596 /*
1597 * at this point, we are not holding any locks
1598 */
1599 if ((ep_freed + ep_moved) >= num_to_evict) {
1600 /*
1601 * we've reached our target for the
1602 * number of pages to evict
1603 */
1604 break;
1605 }
1606 vm_object_cache_lock_spin();
1607 }
1608 /*
1609 * put the page queues lock back to the caller's
1610 * idea of it
1611 */
1612 vm_page_lock_queues();
1613
1614 vm_object_cache_pages_freed += ep_freed;
1615 vm_object_cache_pages_moved += ep_moved;
1616 vm_object_cache_pages_skipped += ep_skipped;
1617
1618 KERNEL_DEBUG(0x13001ec | DBG_FUNC_END, ep_freed, 0, 0, 0, 0);
1619 return (ep_freed);
1620 }
1621
1622
1623 #if VM_OBJECT_CACHE
1624 /*
1625 * Check to see whether we really need to trim
1626 * down the cache. If so, remove an object from
1627 * the cache, terminate it, and repeat.
1628 *
1629 * Called with, and returns with, cache lock unlocked.
1630 */
1631 vm_object_t
1632 vm_object_cache_trim(
1633 boolean_t called_from_vm_object_deallocate)
1634 {
1635 register vm_object_t object = VM_OBJECT_NULL;
1636 vm_object_t shadow;
1637
1638 for (;;) {
1639
1640 /*
1641 * If we no longer need to trim the cache,
1642 * then we are done.
1643 */
1644 if (vm_object_cached_count <= vm_object_cached_max)
1645 return VM_OBJECT_NULL;
1646
1647 vm_object_cache_lock();
1648 if (vm_object_cached_count <= vm_object_cached_max) {
1649 vm_object_cache_unlock();
1650 return VM_OBJECT_NULL;
1651 }
1652
1653 /*
1654 * We must trim down the cache, so remove
1655 * the first object in the cache.
1656 */
1657 XPR(XPR_VM_OBJECT,
1658 "vm_object_cache_trim: removing from front of cache (%x, %x)\n",
1659 vm_object_cached_list.next,
1660 vm_object_cached_list.prev, 0, 0, 0);
1661
1662 object = (vm_object_t) queue_first(&vm_object_cached_list);
1663 if(object == (vm_object_t) &vm_object_cached_list) {
1664 /* something's wrong with the calling parameter or */
1665 /* the value of vm_object_cached_count, just fix */
1666 /* and return */
1667 if(vm_object_cached_max < 0)
1668 vm_object_cached_max = 0;
1669 vm_object_cached_count = 0;
1670 vm_object_cache_unlock();
1671 return VM_OBJECT_NULL;
1672 }
1673 vm_object_lock(object);
1674 queue_remove(&vm_object_cached_list, object, vm_object_t,
1675 cached_list);
1676 vm_object_cached_count--;
1677
1678 vm_object_cache_unlock();
1679 /*
1680 * Since this object is in the cache, we know
1681 * that it is initialized and has no references.
1682 * Take a reference to avoid recursive deallocations.
1683 */
1684
1685 assert(object->pager_initialized);
1686 assert(object->ref_count == 0);
1687 vm_object_lock_assert_exclusive(object);
1688 object->ref_count++;
1689
1690 /*
1691 * Terminate the object.
1692 * If the object had a shadow, we let vm_object_deallocate
1693 * deallocate it. "pageout" objects have a shadow, but
1694 * maintain a "paging reference" rather than a normal
1695 * reference.
1696 * (We are careful here to limit recursion.)
1697 */
1698 shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
1699
1700 if(vm_object_terminate(object) != KERN_SUCCESS)
1701 continue;
1702
1703 if (shadow != VM_OBJECT_NULL) {
1704 if (called_from_vm_object_deallocate) {
1705 return shadow;
1706 } else {
1707 vm_object_deallocate(shadow);
1708 }
1709 }
1710 }
1711 }
1712 #endif
1713
1714
1715 /*
1716 * Routine: vm_object_terminate
1717 * Purpose:
1718 * Free all resources associated with a vm_object.
1719 * In/out conditions:
1720 * Upon entry, the object must be locked,
1721 * and the object must have exactly one reference.
1722 *
1723 * The shadow object reference is left alone.
1724 *
1725 * The object must be unlocked if its found that pages
1726 * must be flushed to a backing object. If someone
1727 * manages to map the object while it is being flushed
1728 * the object is returned unlocked and unchanged. Otherwise,
1729 * upon exit, the cache will be unlocked, and the
1730 * object will cease to exist.
1731 */
1732 static kern_return_t
1733 vm_object_terminate(
1734 vm_object_t object)
1735 {
1736 vm_object_t shadow_object;
1737
1738 XPR(XPR_VM_OBJECT, "vm_object_terminate, object 0x%X ref %d\n",
1739 object, object->ref_count, 0, 0, 0);
1740
1741 if (!object->pageout && (!object->temporary || object->can_persist) &&
1742 (object->pager != NULL || object->shadow_severed)) {
1743 /*
1744 * Clear pager_trusted bit so that the pages get yanked
1745 * out of the object instead of cleaned in place. This
1746 * prevents a deadlock in XMM and makes more sense anyway.
1747 */
1748 object->pager_trusted = FALSE;
1749
1750 vm_object_reap_pages(object, REAP_TERMINATE);
1751 }
1752 /*
1753 * Make sure the object isn't already being terminated
1754 */
1755 if (object->terminating) {
1756 vm_object_lock_assert_exclusive(object);
1757 object->ref_count--;
1758 assert(object->ref_count > 0);
1759 vm_object_unlock(object);
1760 return KERN_FAILURE;
1761 }
1762
1763 /*
1764 * Did somebody get a reference to the object while we were
1765 * cleaning it?
1766 */
1767 if (object->ref_count != 1) {
1768 vm_object_lock_assert_exclusive(object);
1769 object->ref_count--;
1770 assert(object->ref_count > 0);
1771 vm_object_res_deallocate(object);
1772 vm_object_unlock(object);
1773 return KERN_FAILURE;
1774 }
1775
1776 /*
1777 * Make sure no one can look us up now.
1778 */
1779
1780 object->terminating = TRUE;
1781 object->alive = FALSE;
1782
1783 if ( !object->internal && (object->objq.next || object->objq.prev))
1784 vm_object_cache_remove(object);
1785
1786 if (object->hashed) {
1787 lck_mtx_t *lck;
1788
1789 lck = vm_object_hash_lock_spin(object->pager);
1790 vm_object_remove(object);
1791 vm_object_hash_unlock(lck);
1792 }
1793 /*
1794 * Detach the object from its shadow if we are the shadow's
1795 * copy. The reference we hold on the shadow must be dropped
1796 * by our caller.
1797 */
1798 if (((shadow_object = object->shadow) != VM_OBJECT_NULL) &&
1799 !(object->pageout)) {
1800 vm_object_lock(shadow_object);
1801 if (shadow_object->copy == object)
1802 shadow_object->copy = VM_OBJECT_NULL;
1803 vm_object_unlock(shadow_object);
1804 }
1805
1806 if (object->paging_in_progress != 0 ||
1807 object->activity_in_progress != 0) {
1808 /*
1809 * There are still some paging_in_progress references
1810 * on this object, meaning that there are some paging
1811 * or other I/O operations in progress for this VM object.
1812 * Such operations take some paging_in_progress references
1813 * up front to ensure that the object doesn't go away, but
1814 * they may also need to acquire a reference on the VM object,
1815 * to map it in kernel space, for example. That means that
1816 * they may end up releasing the last reference on the VM
1817 * object, triggering its termination, while still holding
1818 * paging_in_progress references. Waiting for these
1819 * pending paging_in_progress references to go away here would
1820 * deadlock.
1821 *
1822 * To avoid deadlocking, we'll let the vm_object_reaper_thread
1823 * complete the VM object termination if it still holds
1824 * paging_in_progress references at this point.
1825 *
1826 * No new paging_in_progress should appear now that the
1827 * VM object is "terminating" and not "alive".
1828 */
1829 vm_object_reap_async(object);
1830 vm_object_unlock(object);
1831 /*
1832 * Return KERN_FAILURE to let the caller know that we
1833 * haven't completed the termination and it can't drop this
1834 * object's reference on its shadow object yet.
1835 * The reaper thread will take care of that once it has
1836 * completed this object's termination.
1837 */
1838 return KERN_FAILURE;
1839 }
1840 /*
1841 * complete the VM object termination
1842 */
1843 vm_object_reap(object);
1844 object = VM_OBJECT_NULL;
1845
1846 /*
1847 * the object lock was released by vm_object_reap()
1848 *
1849 * KERN_SUCCESS means that this object has been terminated
1850 * and no longer needs its shadow object but still holds a
1851 * reference on it.
1852 * The caller is responsible for dropping that reference.
1853 * We can't call vm_object_deallocate() here because that
1854 * would create a recursion.
1855 */
1856 return KERN_SUCCESS;
1857 }
1858
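/*
 * Illustrative sketch (not part of the original source): how a caller of
 * vm_object_terminate() is expected to honor the return convention spelled
 * out above.  This mirrors what vm_object_reaper_thread() does further
 * below; the local variable names are hypothetical.
 */
#if 0	/* example only -- not compiled */
	vm_object_t	shadow;

	/*
	 * Caller holds the last reference and the (exclusive) object lock.
	 * Capture the shadow pointer first: on success the object is gone,
	 * but its reference on the shadow is still outstanding and must be
	 * dropped by the caller.
	 */
	shadow = object->pageout ? VM_OBJECT_NULL : object->shadow;

	if (vm_object_terminate(object) == KERN_SUCCESS) {
		/* the object has been reaped and its lock released */
		if (shadow != VM_OBJECT_NULL)
			vm_object_deallocate(shadow);
	} else {
		/*
		 * KERN_FAILURE: either the object was revived, or the
		 * reaper thread will finish the termination and drop
		 * the shadow reference itself -- don't drop it here.
		 */
	}
#endif	/* example only */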
1859
1860 /*
1861 * vm_object_reap():
1862 *
1863 * Complete the termination of a VM object after it's been marked
1864 * as "terminating" and "!alive" by vm_object_terminate().
1865 *
1866 * The VM object must be locked by caller.
1867 * The lock will be released on return and the VM object is no longer valid.
1868 */
1869 void
1870 vm_object_reap(
1871 vm_object_t object)
1872 {
1873 memory_object_t pager;
1874
1875 vm_object_lock_assert_exclusive(object);
1876 assert(object->paging_in_progress == 0);
1877 assert(object->activity_in_progress == 0);
1878
1879 vm_object_reap_count++;
1880
1881 /*
1882 * Disown this purgeable object to cleanup its owner's purgeable
1883 * ledgers. We need to do this before disconnecting the object
1884 * from its pager, to properly account for compressed pages.
1885 */
1886 if (object->internal &&
1887 object->purgable != VM_PURGABLE_DENY) {
1888 vm_purgeable_accounting(object,
1889 object->purgable,
1890 TRUE); /* disown */
1891 }
1892
1893 pager = object->pager;
1894 object->pager = MEMORY_OBJECT_NULL;
1895
1896 if (pager != MEMORY_OBJECT_NULL)
1897 memory_object_control_disable(object->pager_control);
1898
1899 object->ref_count--;
1900 #if TASK_SWAPPER
1901 assert(object->res_count == 0);
1902 #endif /* TASK_SWAPPER */
1903
1904 assert (object->ref_count == 0);
1905
1906 /*
1907 * remove from purgeable queue if it's on
1908 */
1909 if (object->internal) {
1910 task_t owner;
1911
1912 owner = object->vo_purgeable_owner;
1913
1914 if (object->purgable == VM_PURGABLE_DENY) {
1915 /* not purgeable: nothing to do */
1916 } else if (object->purgable == VM_PURGABLE_VOLATILE) {
1917 purgeable_q_t queue;
1918
1919 assert(object->vo_purgeable_owner == NULL);
1920
1921 queue = vm_purgeable_object_remove(object);
1922 assert(queue);
1923
1924 if (object->purgeable_when_ripe) {
1925 /*
1926 * Must take page lock for this -
1927 * using it to protect token queue
1928 */
1929 vm_page_lock_queues();
1930 vm_purgeable_token_delete_first(queue);
1931
1932 assert(queue->debug_count_objects>=0);
1933 vm_page_unlock_queues();
1934 }
1935
1936 /*
1937 * Update "vm_page_purgeable_count" in bulk and mark
1938 * object as VM_PURGABLE_EMPTY to avoid updating
1939 * "vm_page_purgeable_count" again in vm_page_remove()
1940 * when reaping the pages.
1941 */
1942 unsigned int delta;
1943 assert(object->resident_page_count >=
1944 object->wired_page_count);
1945 delta = (object->resident_page_count -
1946 object->wired_page_count);
1947 if (delta != 0) {
1948 assert(vm_page_purgeable_count >= delta);
1949 OSAddAtomic(-delta,
1950 (SInt32 *)&vm_page_purgeable_count);
1951 }
1952 if (object->wired_page_count != 0) {
1953 assert(vm_page_purgeable_wired_count >=
1954 object->wired_page_count);
1955 OSAddAtomic(-object->wired_page_count,
1956 (SInt32 *)&vm_page_purgeable_wired_count);
1957 }
1958 object->purgable = VM_PURGABLE_EMPTY;
1959 }
1960 else if (object->purgable == VM_PURGABLE_NONVOLATILE ||
1961 object->purgable == VM_PURGABLE_EMPTY) {
1962 /* remove from nonvolatile queue */
1963 assert(object->vo_purgeable_owner == TASK_NULL);
1964 vm_purgeable_nonvolatile_dequeue(object);
1965 } else {
1966 panic("object %p in unexpected purgeable state 0x%x\n",
1967 object, object->purgable);
1968 }
1969 assert(object->objq.next == NULL);
1970 assert(object->objq.prev == NULL);
1971 }
1972
1973 /*
1974 * Clean or free the pages, as appropriate.
1975 * It is possible for us to find busy/absent pages,
1976 * if some faults on this object were aborted.
1977 */
1978 if (object->pageout) {
1979 assert(object->shadow != VM_OBJECT_NULL);
1980
1981 vm_pageout_object_terminate(object);
1982
1983 } else if (((object->temporary && !object->can_persist) || (pager == MEMORY_OBJECT_NULL))) {
1984
1985 vm_object_reap_pages(object, REAP_REAP);
1986 }
1987 assert(queue_empty(&object->memq));
1988 assert(object->paging_in_progress == 0);
1989 assert(object->activity_in_progress == 0);
1990 assert(object->ref_count == 0);
1991
1992 /*
1993 * If the pager has not already been released by
1994 * vm_object_destroy, we need to terminate it and
1995 * release our reference to it here.
1996 */
1997 if (pager != MEMORY_OBJECT_NULL) {
1998 vm_object_unlock(object);
1999 vm_object_release_pager(pager, object->hashed);
2000 vm_object_lock(object);
2001 }
2002
2003 /* kick off anyone waiting on terminating */
2004 object->terminating = FALSE;
2005 vm_object_paging_begin(object);
2006 vm_object_paging_end(object);
2007 vm_object_unlock(object);
2008
2009 #if MACH_PAGEMAP
2010 vm_external_destroy(object->existence_map, object->vo_size);
2011 #endif /* MACH_PAGEMAP */
2012
2013 object->shadow = VM_OBJECT_NULL;
2014
2015 #if VM_OBJECT_TRACKING
2016 if (vm_object_tracking_inited) {
2017 btlog_remove_entries_for_element(vm_object_tracking_btlog,
2018 object);
2019 }
2020 #endif /* VM_OBJECT_TRACKING */
2021
2022 vm_object_lock_destroy(object);
2023 /*
2024 * Free the space for the object.
2025 */
2026 zfree(vm_object_zone, object);
2027 object = VM_OBJECT_NULL;
2028 }
2029
2030
2031 unsigned int vm_max_batch = 256;
2032
2033 #define V_O_R_MAX_BATCH 128
2034
2035 #define BATCH_LIMIT(max) (vm_max_batch >= max ? max : vm_max_batch)
2036
2037
2038 #define VM_OBJ_REAP_FREELIST(_local_free_q, do_disconnect) \
2039 MACRO_BEGIN \
2040 if (_local_free_q) { \
2041 if (do_disconnect) { \
2042 vm_page_t m; \
2043 for (m = _local_free_q; \
2044 m != VM_PAGE_NULL; \
2045 m = (vm_page_t) m->pageq.next) { \
2046 if (m->pmapped) { \
2047 pmap_disconnect(m->phys_page); \
2048 } \
2049 } \
2050 } \
2051 vm_page_free_list(_local_free_q, TRUE); \
2052 _local_free_q = VM_PAGE_NULL; \
2053 } \
2054 MACRO_END
2055
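/*
 * Illustrative sketch (not part of the original source): the reap loop
 * below batches pages onto a local, singly-linked free queue while the
 * page-queue lock is held, and periodically drops the lock to free the
 * whole batch via VM_OBJ_REAP_FREELIST().  This is a condensed rendition
 * of that pattern with the per-reap-type handling stripped out; the
 * function name is hypothetical.
 */
#if 0	/* example only -- not compiled */
static void
example_reap_batching(vm_object_t object)
{
	vm_page_t	p, next;
	vm_page_t	local_free_q = VM_PAGE_NULL;
	int		loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH);

	vm_page_lockspin_queues();

	next = (vm_page_t)queue_first(&object->memq);
	while (!queue_end(&object->memq, (queue_entry_t)next)) {
		p = next;
		next = (vm_page_t)queue_next(&next->listq);

		if (--loop_count == 0) {
			/* free the batch and take a break from the lock */
			vm_page_unlock_queues();
			VM_OBJ_REAP_FREELIST(local_free_q, FALSE);
			loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH);
			vm_page_lockspin_queues();
		}
		vm_page_free_prepare_queues(p);
		/* chain the page onto the local free queue */
		p->pageq.next = (queue_entry_t) local_free_q;
		local_free_q = p;
	}
	vm_page_unlock_queues();

	/* free whatever remains on the local queue */
	VM_OBJ_REAP_FREELIST(local_free_q, FALSE);
}
#endif	/* example only */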
2056
2057 void
2058 vm_object_reap_pages(
2059 vm_object_t object,
2060 int reap_type)
2061 {
2062 vm_page_t p;
2063 vm_page_t next;
2064 vm_page_t local_free_q = VM_PAGE_NULL;
2065 int loop_count;
2066 boolean_t disconnect_on_release;
2067 pmap_flush_context pmap_flush_context_storage;
2068
2069 if (reap_type == REAP_DATA_FLUSH) {
2070 /*
2071 * We need to disconnect pages from all pmaps before
2072 * releasing them to the free list
2073 */
2074 disconnect_on_release = TRUE;
2075 } else {
2076 /*
2077 * Either the caller has already disconnected the pages
2078 * from all pmaps, or we disconnect them here as we add
2079 * them to our local list of pages to be released.
2080 * No need to re-disconnect them when we release the pages
2081 * to the free list.
2082 */
2083 disconnect_on_release = FALSE;
2084 }
2085
2086 restart_after_sleep:
2087 if (queue_empty(&object->memq))
2088 return;
2089 loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH);
2090
2091 if (reap_type == REAP_PURGEABLE)
2092 pmap_flush_context_init(&pmap_flush_context_storage);
2093
2094 vm_page_lockspin_queues();
2095
2096 next = (vm_page_t)queue_first(&object->memq);
2097
2098 while (!queue_end(&object->memq, (queue_entry_t)next)) {
2099
2100 p = next;
2101 next = (vm_page_t)queue_next(&next->listq);
2102
2103 if (--loop_count == 0) {
2104
2105 vm_page_unlock_queues();
2106
2107 if (local_free_q) {
2108
2109 if (reap_type == REAP_PURGEABLE) {
2110 pmap_flush(&pmap_flush_context_storage);
2111 pmap_flush_context_init(&pmap_flush_context_storage);
2112 }
2113 /*
2114 * Free the pages we reclaimed so far
2115 * and take a little break to avoid
2116 * hogging the page queue lock too long
2117 */
2118 VM_OBJ_REAP_FREELIST(local_free_q,
2119 disconnect_on_release);
2120 } else
2121 mutex_pause(0);
2122
2123 loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH);
2124
2125 vm_page_lockspin_queues();
2126 }
2127 if (reap_type == REAP_DATA_FLUSH || reap_type == REAP_TERMINATE) {
2128
2129 if (p->busy || p->cleaning) {
2130
2131 vm_page_unlock_queues();
2132 /*
2133 * free the pages reclaimed so far
2134 */
2135 VM_OBJ_REAP_FREELIST(local_free_q,
2136 disconnect_on_release);
2137
2138 PAGE_SLEEP(object, p, THREAD_UNINT);
2139
2140 goto restart_after_sleep;
2141 }
2142 if (p->laundry) {
2143 p->pageout = FALSE;
2144
2145 vm_pageout_steal_laundry(p, TRUE);
2146 }
2147 }
2148 switch (reap_type) {
2149
2150 case REAP_DATA_FLUSH:
2151 if (VM_PAGE_WIRED(p)) {
2152 /*
2153 * this is an odd case... perhaps we should
2154 * zero-fill this page since we're conceptually
2155 * tossing its data at this point, but leaving
2156 * it on the object to honor the 'wire' contract
2157 */
2158 continue;
2159 }
2160 break;
2161
2162 case REAP_PURGEABLE:
2163 if (VM_PAGE_WIRED(p)) {
2164 /*
2165 * can't purge a wired page
2166 */
2167 vm_page_purged_wired++;
2168 continue;
2169 }
2170 if (p->laundry && !p->busy && !p->cleaning) {
2171 p->pageout = FALSE;
2172
2173 vm_pageout_steal_laundry(p, TRUE);
2174 }
2175 if (p->cleaning || p->laundry || p->absent) {
2176 /*
2177 * page is being acted upon,
2178 * so don't mess with it
2179 */
2180 vm_page_purged_others++;
2181 continue;
2182 }
2183 if (p->busy) {
2184 /*
2185 * We can't reclaim a busy page, but since it's
2186 * not wired we can deactivate it to make sure
2187 * that it gets considered by
2188 * vm_pageout_scan() later.
2189 */
2190 vm_page_deactivate(p);
2191 vm_page_purged_busy++;
2192 continue;
2193 }
2194
2195 assert(p->object != kernel_object);
2196
2197 /*
2198 * we can discard this page...
2199 */
2200 if (p->pmapped == TRUE) {
2201 /*
2202 * unmap the page
2203 */
2204 pmap_disconnect_options(p->phys_page, PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_NOREFMOD, (void *)&pmap_flush_context_storage);
2205 }
2206 vm_page_purged_count++;
2207
2208 break;
2209
2210 case REAP_TERMINATE:
2211 if (p->absent || p->private) {
2212 /*
2213 * For private pages, VM_PAGE_FREE just
2214 * leaves the page structure around for
2215 * its owner to clean up. For absent
2216 * pages, the structure is returned to
2217 * the appropriate pool.
2218 */
2219 break;
2220 }
2221 if (p->fictitious) {
2222 assert (p->phys_page == vm_page_guard_addr);
2223 break;
2224 }
2225 if (!p->dirty && p->wpmapped)
2226 p->dirty = pmap_is_modified(p->phys_page);
2227
2228 if ((p->dirty || p->precious) && !p->error && object->alive) {
2229
2230 if (!p->laundry) {
2231 VM_PAGE_QUEUES_REMOVE(p);
2232 /*
2233 * flush page... page will be freed
2234 * upon completion of I/O
2235 */
2236 vm_pageout_cluster(p, TRUE);
2237 }
2238 vm_page_unlock_queues();
2239 /*
2240 * free the pages reclaimed so far
2241 */
2242 VM_OBJ_REAP_FREELIST(local_free_q,
2243 disconnect_on_release);
2244
2245 vm_object_paging_wait(object, THREAD_UNINT);
2246
2247 goto restart_after_sleep;
2248 }
2249 break;
2250
2251 case REAP_REAP:
2252 break;
2253 }
2254 vm_page_free_prepare_queues(p);
2255 assert(p->pageq.next == NULL && p->pageq.prev == NULL);
2256 /*
2257 * Add this page to our list of reclaimed pages,
2258 * to be freed later.
2259 */
2260 p->pageq.next = (queue_entry_t) local_free_q;
2261 local_free_q = p;
2262 }
2263 vm_page_unlock_queues();
2264
2265 /*
2266 * Free the remaining reclaimed pages
2267 */
2268 if (reap_type == REAP_PURGEABLE)
2269 pmap_flush(&pmap_flush_context_storage);
2270
2271 VM_OBJ_REAP_FREELIST(local_free_q,
2272 disconnect_on_release);
2273 }
2274
2275
2276 void
2277 vm_object_reap_async(
2278 vm_object_t object)
2279 {
2280 vm_object_lock_assert_exclusive(object);
2281
2282 vm_object_reaper_lock_spin();
2283
2284 vm_object_reap_count_async++;
2285
2286 /* enqueue the VM object... */
2287 queue_enter(&vm_object_reaper_queue, object,
2288 vm_object_t, cached_list);
2289
2290 vm_object_reaper_unlock();
2291
2292 /* ... and wake up the reaper thread */
2293 thread_wakeup((event_t) &vm_object_reaper_queue);
2294 }
2295
2296
2297 void
2298 vm_object_reaper_thread(void)
2299 {
2300 vm_object_t object, shadow_object;
2301
2302 vm_object_reaper_lock_spin();
2303
2304 while (!queue_empty(&vm_object_reaper_queue)) {
2305 queue_remove_first(&vm_object_reaper_queue,
2306 object,
2307 vm_object_t,
2308 cached_list);
2309
2310 vm_object_reaper_unlock();
2311 vm_object_lock(object);
2312
2313 assert(object->terminating);
2314 assert(!object->alive);
2315
2316 /*
2317 * The pageout daemon might be playing with our pages.
2318 * Now that the object is dead, it won't touch any more
2319 * pages, but some pages might already be on their way out.
2320 * Hence, we wait until the active paging activities have
2321 * ceased before we break the association with the pager
2322 * itself.
2323 */
2324 while (object->paging_in_progress != 0 ||
2325 object->activity_in_progress != 0) {
2326 vm_object_wait(object,
2327 VM_OBJECT_EVENT_PAGING_IN_PROGRESS,
2328 THREAD_UNINT);
2329 vm_object_lock(object);
2330 }
2331
2332 shadow_object =
2333 object->pageout ? VM_OBJECT_NULL : object->shadow;
2334
2335 vm_object_reap(object);
2336 /* cache is unlocked and object is no longer valid */
2337 object = VM_OBJECT_NULL;
2338
2339 if (shadow_object != VM_OBJECT_NULL) {
2340 /*
2341 * Drop the reference "object" was holding on
2342 * its shadow object.
2343 */
2344 vm_object_deallocate(shadow_object);
2345 shadow_object = VM_OBJECT_NULL;
2346 }
2347 vm_object_reaper_lock_spin();
2348 }
2349
2350 /* wait for more work... */
2351 assert_wait((event_t) &vm_object_reaper_queue, THREAD_UNINT);
2352
2353 vm_object_reaper_unlock();
2354
2355 thread_block((thread_continue_t) vm_object_reaper_thread);
2356 /*NOTREACHED*/
2357 }
2358
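/*
 * Illustrative sketch (not part of the original source): the reaper thread
 * above uses the classic Mach continuation idiom -- the producer enqueues
 * work and calls thread_wakeup(), the consumer drains the queue, then
 * assert_wait()s and thread_block()s with itself as the continuation.
 * A minimal rendition with hypothetical names; locking is omitted for
 * brevity (the real code holds the reaper lock around the queue).
 */
#if 0	/* example only -- not compiled */
static queue_head_t	example_work_queue;

/* producer side (cf. vm_object_reap_async) */
static void
example_enqueue_work(vm_object_t object)
{
	queue_enter(&example_work_queue, object, vm_object_t, cached_list);
	thread_wakeup((event_t) &example_work_queue);
}

/* consumer side (cf. vm_object_reaper_thread) */
static void
example_worker_thread(void)
{
	vm_object_t	object;

	while (!queue_empty(&example_work_queue)) {
		queue_remove_first(&example_work_queue, object,
				   vm_object_t, cached_list);
		/* ... process "object" ... */
	}
	/* sleep until the producer wakes us, then restart from the top */
	assert_wait((event_t) &example_work_queue, THREAD_UNINT);
	thread_block((thread_continue_t) example_worker_thread);
	/*NOTREACHED*/
}
#endif	/* example only */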
2359 /*
2360 * Routine: vm_object_pager_wakeup
2361 * Purpose: Wake up anyone waiting for termination of a pager.
2362 */
2363
2364 static void
2365 vm_object_pager_wakeup(
2366 memory_object_t pager)
2367 {
2368 vm_object_hash_entry_t entry;
2369 boolean_t waiting = FALSE;
2370 lck_mtx_t *lck;
2371
2372 /*
2373 * If anyone was waiting for the memory_object_terminate
2374 * to be queued, wake them up now.
2375 */
2376 lck = vm_object_hash_lock_spin(pager);
2377 entry = vm_object_hash_lookup(pager, TRUE);
2378 if (entry != VM_OBJECT_HASH_ENTRY_NULL)
2379 waiting = entry->waiting;
2380 vm_object_hash_unlock(lck);
2381
2382 if (entry != VM_OBJECT_HASH_ENTRY_NULL) {
2383 if (waiting)
2384 thread_wakeup((event_t) pager);
2385 vm_object_hash_entry_free(entry);
2386 }
2387 }
2388
2389 /*
2390 * Routine: vm_object_release_pager
2391 * Purpose: Terminate the pager and, upon completion,
2392 * release our last reference to it.
2393 * This is just like memory_object_terminate, except
2394 * that we wake up anyone blocked in vm_object_enter
2395 * waiting for the termination message to be queued
2396 * before calling memory_object_init.
2397 */
2398 static void
2399 vm_object_release_pager(
2400 memory_object_t pager,
2401 boolean_t hashed)
2402 {
2403
2404 /*
2405 * Terminate the pager.
2406 */
2407
2408 (void) memory_object_terminate(pager);
2409
2410 if (hashed == TRUE) {
2411 /*
2412 * Wakeup anyone waiting for this terminate
2413 * and remove the entry from the hash
2414 */
2415 vm_object_pager_wakeup(pager);
2416 }
2417 /*
2418 * Release reference to pager.
2419 */
2420 memory_object_deallocate(pager);
2421 }
2422
2423 /*
2424 * Routine: vm_object_destroy
2425 * Purpose:
2426 * Shut down a VM object, despite the
2427 * presence of address map (or other) references
2428 * to the vm_object.
2429 */
2430 kern_return_t
2431 vm_object_destroy(
2432 vm_object_t object,
2433 __unused kern_return_t reason)
2434 {
2435 memory_object_t old_pager;
2436
2437 if (object == VM_OBJECT_NULL)
2438 return(KERN_SUCCESS);
2439
2440 /*
2441 * Remove the pager association immediately.
2442 *
2443 * This will prevent the memory manager from further
2444 * meddling. [If it wanted to flush data or make
2445 * other changes, it should have done so before performing
2446 * the destroy call.]
2447 */
2448
2449 vm_object_lock(object);
2450 object->can_persist = FALSE;
2451 object->named = FALSE;
2452 object->alive = FALSE;
2453
2454 if (object->hashed) {
2455 lck_mtx_t *lck;
2456 /*
2457 * Rip out the pager from the vm_object now...
2458 */
2459 lck = vm_object_hash_lock_spin(object->pager);
2460 vm_object_remove(object);
2461 vm_object_hash_unlock(lck);
2462 }
2463 old_pager = object->pager;
2464 object->pager = MEMORY_OBJECT_NULL;
2465 if (old_pager != MEMORY_OBJECT_NULL)
2466 memory_object_control_disable(object->pager_control);
2467
2468 /*
2469 * Wait for the existing paging activity (that got
2470 * through before we nulled out the pager) to subside.
2471 */
2472
2473 vm_object_paging_wait(object, THREAD_UNINT);
2474 vm_object_unlock(object);
2475
2476 /*
2477 * Terminate the object now.
2478 */
2479 if (old_pager != MEMORY_OBJECT_NULL) {
2480 vm_object_release_pager(old_pager, object->hashed);
2481
2482 /*
2483 * JMM - Release the caller's reference. This assumes the
2484 * caller had a reference to release, which is a big (but
2485 * currently valid) assumption if this is driven from the
2486 * vnode pager (it is holding a named reference when making
2487 * this call).
2488 */
2489 vm_object_deallocate(object);
2490
2491 }
2492 return(KERN_SUCCESS);
2493 }
2494
2495
2496 #if VM_OBJECT_CACHE
2497
2498 #define VM_OBJ_DEACT_ALL_STATS DEBUG
2499 #if VM_OBJ_DEACT_ALL_STATS
2500 uint32_t vm_object_deactivate_all_pages_batches = 0;
2501 uint32_t vm_object_deactivate_all_pages_pages = 0;
2502 #endif /* VM_OBJ_DEACT_ALL_STATS */
2503 /*
2504 * vm_object_deactivate_all_pages
2505 *
2506 * Deactivate all pages in the specified object. (Keep its pages
2507 * in memory even though it is no longer referenced.)
2508 *
2509 * The object must be locked.
2510 */
2511 static void
2512 vm_object_deactivate_all_pages(
2513 register vm_object_t object)
2514 {
2515 register vm_page_t p;
2516 int loop_count;
2517 #if VM_OBJ_DEACT_ALL_STATS
2518 int pages_count;
2519 #endif /* VM_OBJ_DEACT_ALL_STATS */
2520 #define V_O_D_A_P_MAX_BATCH 256
2521
2522 loop_count = BATCH_LIMIT(V_O_D_A_P_MAX_BATCH);
2523 #if VM_OBJ_DEACT_ALL_STATS
2524 pages_count = 0;
2525 #endif /* VM_OBJ_DEACT_ALL_STATS */
2526 vm_page_lock_queues();
2527 queue_iterate(&object->memq, p, vm_page_t, listq) {
2528 if (--loop_count == 0) {
2529 #if VM_OBJ_DEACT_ALL_STATS
2530 hw_atomic_add(&vm_object_deactivate_all_pages_batches,
2531 1);
2532 hw_atomic_add(&vm_object_deactivate_all_pages_pages,
2533 pages_count);
2534 pages_count = 0;
2535 #endif /* VM_OBJ_DEACT_ALL_STATS */
2536 lck_mtx_yield(&vm_page_queue_lock);
2537 loop_count = BATCH_LIMIT(V_O_D_A_P_MAX_BATCH);
2538 }
2539 if (!p->busy && !p->throttled) {
2540 #if VM_OBJ_DEACT_ALL_STATS
2541 pages_count++;
2542 #endif /* VM_OBJ_DEACT_ALL_STATS */
2543 vm_page_deactivate(p);
2544 }
2545 }
2546 #if VM_OBJ_DEACT_ALL_STATS
2547 if (pages_count) {
2548 hw_atomic_add(&vm_object_deactivate_all_pages_batches, 1);
2549 hw_atomic_add(&vm_object_deactivate_all_pages_pages,
2550 pages_count);
2551 pages_count = 0;
2552 }
2553 #endif /* VM_OBJ_DEACT_ALL_STATS */
2554 vm_page_unlock_queues();
2555 }
2556 #endif /* VM_OBJECT_CACHE */
2557
2558
2559
2560 /*
2561 * The "chunk" macros are used by routines below when looking for pages to deactivate. These
2562 * exist because of the need to handle shadow chains. When deactivating pages, we only
2563 * want to deactivate the ones at the topmost level in the object chain. In order to do
2564 * this efficiently, the specified address range is divided up into "chunks" and we use
2565 * a bit map to keep track of which pages have already been processed as we descend down
2566 * the shadow chain. These chunk macros hide the details of the bit map implementation
2567 * as much as we can.
2568 *
2569 * For convenience, we use a 64-bit data type as the bit map, and therefore a chunk is
2570 * set to 64 pages. The bit map is indexed from the low-order end, so that the lowest
2571 * order bit represents page 0 in the current range and highest order bit represents
2572 * page 63.
2573 *
2574 * For further convenience, we also use negative logic for the page state in the bit map.
2575 * The bit is set to 1 to indicate it has not yet been seen, and to 0 to indicate it has
2576 * been processed. This way we can simply test the 64-bit long word to see if it's zero
2577 * to easily tell if the whole range has been processed. Therefore, the bit map starts
2578 * out with all the bits set. The macros below hide all these details from the caller.
2579 */
2580
2581 #define PAGES_IN_A_CHUNK 64 /* The number of pages in the chunk must */
2582 /* be the same as the number of bits in */
2583 /* the chunk_state_t type. We use 64 */
2584 /* just for convenience. */
2585
2586 #define CHUNK_SIZE (PAGES_IN_A_CHUNK * PAGE_SIZE_64) /* Size of a chunk in bytes */
2587
2588 typedef uint64_t chunk_state_t;
2589
2590 /*
2591 * The bit map uses negative logic, so we start out with all 64 bits set to indicate
2592 * that no pages have been processed yet. Also, if len is less than the full CHUNK_SIZE,
2593 * then we mark pages beyond the len as having been "processed" so that we don't waste time
2594 * looking at pages in that range. This can save us from unnecessarily chasing down the
2595 * shadow chain.
2596 */
2597
2598 #define CHUNK_INIT(c, len) \
2599 MACRO_BEGIN \
2600 uint64_t p; \
2601 \
2602 (c) = 0xffffffffffffffffLL; \
2603 \
2604 for (p = (len) / PAGE_SIZE_64; p < PAGES_IN_A_CHUNK; p++) \
2605 MARK_PAGE_HANDLED(c, p); \
2606 MACRO_END
2607
2608
2609 /*
2610 * Return true if all pages in the chunk have not yet been processed.
2611 */
2612
2613 #define CHUNK_NOT_COMPLETE(c) ((c) != 0)
2614
2615 /*
2616 * Return true if the page at offset 'p' in the bit map has already been handled
2617 * while processing a higher level object in the shadow chain.
2618 */
2619
2620 #define PAGE_ALREADY_HANDLED(c, p) (((c) & (1LL << (p))) == 0)
2621
2622 /*
2623 * Mark the page at offset 'p' in the bit map as having been processed.
2624 */
2625
2626 #define MARK_PAGE_HANDLED(c, p) \
2627 MACRO_BEGIN \
2628 (c) = (c) & ~(1LL << (p)); \
2629 MACRO_END
2630
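/*
 * Illustrative sketch (not part of the original source): a worked example
 * of the chunk bit map macros above.  For a 3-page range, CHUNK_INIT()
 * leaves bits 0..2 set (still to be processed) and pre-clears the rest;
 * marking a page handled clears its bit; the chunk is complete once the
 * whole word reaches zero.  The function name is hypothetical.
 */
#if 0	/* example only -- not compiled */
static void
example_chunk_bitmap(void)
{
	chunk_state_t	cs;

	CHUNK_INIT(cs, 3 * PAGE_SIZE_64);	/* cs == 0x7: pages 0..2 pending */

	assert(CHUNK_NOT_COMPLETE(cs));
	assert(!PAGE_ALREADY_HANDLED(cs, 1));

	MARK_PAGE_HANDLED(cs, 1);		/* cs == 0x5 */
	assert(PAGE_ALREADY_HANDLED(cs, 1));

	MARK_PAGE_HANDLED(cs, 0);
	MARK_PAGE_HANDLED(cs, 2);		/* cs == 0: whole chunk done */
	assert(!CHUNK_NOT_COMPLETE(cs));
}
#endif	/* example only */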
2631
2632 /*
2633 * Return true if the page at the given offset has been paged out. Object is
2634 * locked upon entry and returned locked.
2635 */
2636
2637 static boolean_t
2638 page_is_paged_out(
2639 vm_object_t object,
2640 vm_object_offset_t offset)
2641 {
2642 kern_return_t kr;
2643 memory_object_t pager;
2644
2645 /*
2646 * Check the existence map for the page if we have one, otherwise
2647 * ask the pager about this page.
2648 */
2649
2650 #if MACH_PAGEMAP
2651 if (object->existence_map) {
2652 if (vm_external_state_get(object->existence_map, offset)
2653 == VM_EXTERNAL_STATE_EXISTS) {
2654 /*
2655 * We found the page
2656 */
2657
2658 return TRUE;
2659 }
2660 } else
2661 #endif /* MACH_PAGEMAP */
2662 if (object->internal &&
2663 object->alive &&
2664 !object->terminating &&
2665 object->pager_ready) {
2666
2667 if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) {
2668 if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
2669 == VM_EXTERNAL_STATE_EXISTS) {
2670 return TRUE;
2671 } else {
2672 return FALSE;
2673 }
2674 }
2675
2676 /*
2677 * We're already holding a "paging in progress" reference
2678 * so the object can't disappear when we release the lock.
2679 */
2680
2681 assert(object->paging_in_progress);
2682 pager = object->pager;
2683 vm_object_unlock(object);
2684
2685 kr = memory_object_data_request(
2686 pager,
2687 offset + object->paging_offset,
2688 0, /* just poke the pager */
2689 VM_PROT_READ,
2690 NULL);
2691
2692 vm_object_lock(object);
2693
2694 if (kr == KERN_SUCCESS) {
2695
2696 /*
2697 * We found the page
2698 */
2699
2700 return TRUE;
2701 }
2702 }
2703
2704 return FALSE;
2705 }
2706
2707
2708
2709 /*
2710 * madvise_free_debug
2711 *
2712 * To help debug madvise(MADV_FREE*) mis-usage, this triggers a
2713 * zero-fill as soon as a page is affected by a madvise(MADV_FREE*), to
2714 * simulate the loss of the page's contents as if the page had been
2715 * reclaimed and then re-faulted.
2716 */
2717 #if DEVELOPMENT || DEBUG
2718 int madvise_free_debug = 1;
2719 #else /* DEVELOPMENT || DEBUG */
2720 int madvise_free_debug = 0;
2721 #endif /* DEVELOPMENT || DEBUG */
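/*
 * Illustrative sketch (not part of the original source): the deactivate /
 * "kill_page" paths below are ultimately driven by madvise(2) from user
 * space.  A minimal user-level fragment, assuming Darwin's
 * MADV_FREE_REUSABLE / MADV_FREE_REUSE advice values; with
 * madvise_free_debug enabled, the affected pages are zero-filled right
 * away so code that still relies on their contents fails fast.
 */
#if 0	/* user-space example only -- not kernel code, not compiled */
#include <sys/mman.h>

static void
example_madvise_free(void)
{
	size_t	len = 16 * 4096;
	void	*buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			    MAP_ANON | MAP_PRIVATE, -1, 0);

	/* ... fill and use buf ... */

	/* contents no longer needed: let the VM reclaim the pages lazily */
	madvise(buf, len, MADV_FREE_REUSABLE);

	/* before reusing the buffer, take the pages back */
	madvise(buf, len, MADV_FREE_REUSE);
}
#endif	/* example only */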
2722
2723 /*
2724 * Deactivate the pages in the specified object and range. If kill_page is set, also discard any
2725 * page modified state from the pmap. Update the chunk_state as we go along. The caller must specify
2726 * a size that is less than or equal to the CHUNK_SIZE.
2727 */
2728
2729 static void
2730 deactivate_pages_in_object(
2731 vm_object_t object,
2732 vm_object_offset_t offset,
2733 vm_object_size_t size,
2734 boolean_t kill_page,
2735 boolean_t reusable_page,
2736 boolean_t all_reusable,
2737 chunk_state_t *chunk_state,
2738 pmap_flush_context *pfc)
2739 {
2740 vm_page_t m;
2741 int p;
2742 struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
2743 struct vm_page_delayed_work *dwp;
2744 int dw_count;
2745 int dw_limit;
2746 unsigned int reusable = 0;
2747
2748 /*
2749 * Examine each page in the chunk. The variable 'p' is the page number relative to the start of the
2750 * chunk. Since this routine is called once for each level in the shadow chain, the chunk_state may
2751 * have pages marked as having been processed already. We stop the loop early if we find we've handled
2752 * all the pages in the chunk.
2753 */
2754
2755 dwp = &dw_array[0];
2756 dw_count = 0;
2757 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
2758
2759 for(p = 0; size && CHUNK_NOT_COMPLETE(*chunk_state); p++, size -= PAGE_SIZE_64, offset += PAGE_SIZE_64) {
2760
2761 /*
2762 * If this offset has already been found and handled in a higher level object, then don't
2763 * do anything with it in the current shadow object.
2764 */
2765
2766 if (PAGE_ALREADY_HANDLED(*chunk_state, p))
2767 continue;
2768
2769 /*
2770 * See if the page at this offset is around. First check to see if the page is resident,
2771 * then if not, check the existence map or with the pager.
2772 */
2773
2774 if ((m = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
2775
2776 /*
2777 * We found a page we were looking for. Mark it as "handled" now in the chunk_state
2778 * so that we won't bother looking for a page at this offset again if there are more
2779 * shadow objects. Then deactivate the page.
2780 */
2781
2782 MARK_PAGE_HANDLED(*chunk_state, p);
2783
2784 if (( !VM_PAGE_WIRED(m)) && (!m->private) && (!m->gobbled) && (!m->busy) && (!m->laundry)) {
2785 int clear_refmod;
2786 int pmap_options;
2787
2788 dwp->dw_mask = 0;
2789
2790 pmap_options = 0;
2791 clear_refmod = VM_MEM_REFERENCED;
2792 dwp->dw_mask |= DW_clear_reference;
2793
2794 if ((kill_page) && (object->internal)) {
2795 if (madvise_free_debug) {
2796 /*
2797 * zero-fill the page now
2798 * to simulate it being
2799 * reclaimed and re-faulted.
2800 */
2801 pmap_zero_page(m->phys_page);
2802 }
2803 m->precious = FALSE;
2804 m->dirty = FALSE;
2805
2806 clear_refmod |= VM_MEM_MODIFIED;
2807 if (m->throttled) {
2808 /*
2809 * This page is now clean and
2810 * reclaimable. Move it out
2811 * of the throttled queue, so
2812 * that vm_pageout_scan() can
2813 * find it.
2814 */
2815 dwp->dw_mask |= DW_move_page;
2816 }
2817 #if MACH_PAGEMAP
2818 vm_external_state_clr(object->existence_map, offset);
2819 #endif /* MACH_PAGEMAP */
2820 VM_COMPRESSOR_PAGER_STATE_CLR(object,
2821 offset);
2822
2823 if (reusable_page && !m->reusable) {
2824 assert(!all_reusable);
2825 assert(!object->all_reusable);
2826 m->reusable = TRUE;
2827 object->reusable_page_count++;
2828 assert(object->resident_page_count >= object->reusable_page_count);
2829 reusable++;
2830 /*
2831 * Tell pmap this page is now
2832 * "reusable" (to update pmap
2833 * stats for all mappings).
2834 */
2835 pmap_options |= PMAP_OPTIONS_SET_REUSABLE;
2836 }
2837 }
2838 pmap_options |= PMAP_OPTIONS_NOFLUSH;
2839 pmap_clear_refmod_options(m->phys_page,
2840 clear_refmod,
2841 pmap_options,
2842 (void *)pfc);
2843
2844 if (!m->throttled && !(reusable_page || all_reusable))
2845 dwp->dw_mask |= DW_move_page;
2846
2847 if (dwp->dw_mask)
2848 VM_PAGE_ADD_DELAYED_WORK(dwp, m,
2849 dw_count);
2850
2851 if (dw_count >= dw_limit) {
2852 if (reusable) {
2853 OSAddAtomic(reusable,
2854 &vm_page_stats_reusable.reusable_count);
2855 vm_page_stats_reusable.reusable += reusable;
2856 reusable = 0;
2857 }
2858 vm_page_do_delayed_work(object, &dw_array[0], dw_count);
2859
2860 dwp = &dw_array[0];
2861 dw_count = 0;
2862 }
2863 }
2864
2865 } else {
2866
2867 /*
2868 * The page at this offset isn't memory resident, check to see if it's
2869 * been paged out. If so, mark it as handled so we don't bother looking
2870 * for it in the shadow chain.
2871 */
2872
2873 if (page_is_paged_out(object, offset)) {
2874 MARK_PAGE_HANDLED(*chunk_state, p);
2875
2876 /*
2877 * If we're killing a non-resident page, then clear the page in the existence
2878 * map so we don't bother paging it back in if it's touched again in the future.
2879 */
2880
2881 if ((kill_page) && (object->internal)) {
2882 #if MACH_PAGEMAP
2883 vm_external_state_clr(object->existence_map, offset);
2884 #endif /* MACH_PAGEMAP */
2885 VM_COMPRESSOR_PAGER_STATE_CLR(object,
2886 offset);
2887 }
2888 }
2889 }
2890 }
2891
2892 if (reusable) {
2893 OSAddAtomic(reusable, &vm_page_stats_reusable.reusable_count);
2894 vm_page_stats_reusable.reusable += reusable;
2895 reusable = 0;
2896 }
2897
2898 if (dw_count)
2899 vm_page_do_delayed_work(object, &dw_array[0], dw_count);
2900 }
2901
2902
2903 /*
2904 * Deactivate a "chunk" of the given range of the object starting at offset. A "chunk"
2905 * will always be less than or equal to the given size. The total range is divided up
2906 * into chunks for efficiency and performance related to the locks and handling the shadow
2907 * chain. This routine returns how much of the given "size" it actually processed. It's
2908 * up to the caller to loop and keep calling this routine until the entire range they want
2909 * to process has been done.
2910 */
2911
2912 static vm_object_size_t
2913 deactivate_a_chunk(
2914 vm_object_t orig_object,
2915 vm_object_offset_t offset,
2916 vm_object_size_t size,
2917 boolean_t kill_page,
2918 boolean_t reusable_page,
2919 boolean_t all_reusable,
2920 pmap_flush_context *pfc)
2921 {
2922 vm_object_t object;
2923 vm_object_t tmp_object;
2924 vm_object_size_t length;
2925 chunk_state_t chunk_state;
2926
2927
2928 /*
2929 * Get set to do a chunk. We'll do up to CHUNK_SIZE, but no more than the
2930 * remaining size the caller asked for.
2931 */
2932
2933 length = MIN(size, CHUNK_SIZE);
2934
2935 /*
2936 * The chunk_state keeps track of which pages we've already processed if there's
2937 * a shadow chain on this object. At this point, we haven't done anything with this
2938 * range of pages yet, so initialize the state to indicate no pages processed yet.
2939 */
2940
2941 CHUNK_INIT(chunk_state, length);
2942 object = orig_object;
2943
2944 /*
2945 * Start at the top level object and iterate around the loop once for each object
2946 * in the shadow chain. We stop processing early if we've already found all the pages
2947 * in the range. Otherwise we stop when we run out of shadow objects.
2948 */
2949
2950 while (object && CHUNK_NOT_COMPLETE(chunk_state)) {
2951 vm_object_paging_begin(object);
2952
2953 deactivate_pages_in_object(object, offset, length, kill_page, reusable_page, all_reusable, &chunk_state, pfc);
2954
2955 vm_object_paging_end(object);
2956
2957 /*
2958 * We've finished with this object, see if there's a shadow object. If
2959 * there is, update the offset and lock the new object. We also turn off
2960 * kill_page at this point since we only kill pages in the top most object.
2961 */
2962
2963 tmp_object = object->shadow;
2964
2965 if (tmp_object) {
2966 kill_page = FALSE;
2967 reusable_page = FALSE;
2968 all_reusable = FALSE;
2969 offset += object->vo_shadow_offset;
2970 vm_object_lock(tmp_object);
2971 }
2972
2973 if (object != orig_object)
2974 vm_object_unlock(object);
2975
2976 object = tmp_object;
2977 }
2978
2979 if (object && object != orig_object)
2980 vm_object_unlock(object);
2981
2982 return length;
2983 }
2984
2985
2986
2987 /*
2988 * Move any resident pages in the specified range to the inactive queue. If kill_page is set,
2989 * we also clear the modified status of the page and "forget" any changes that have been made
2990 * to the page.
2991 */
2992
2993 __private_extern__ void
2994 vm_object_deactivate_pages(
2995 vm_object_t object,
2996 vm_object_offset_t offset,
2997 vm_object_size_t size,
2998 boolean_t kill_page,
2999 boolean_t reusable_page)
3000 {
3001 vm_object_size_t length;
3002 boolean_t all_reusable;
3003 pmap_flush_context pmap_flush_context_storage;
3004
3005 /*
3006 * We break the range up into chunks and do one chunk at a time. This is for
3007 * efficiency and performance while handling the shadow chains and the locks.
3008 * The deactivate_a_chunk() function returns how much of the range it processed.
3009 * We keep calling this routine until the given size is exhausted.
3010 */
3011
3012
3013 all_reusable = FALSE;
3014 #if 11
3015 /*
3016 * For the sake of accurate "reusable" pmap stats, we need
3017 * to tell pmap about each page that is no longer "reusable",
3018 * so we can't do the "all_reusable" optimization.
3019 */
3020 #else
3021 if (reusable_page &&
3022 object->internal &&
3023 object->vo_size != 0 &&
3024 object->vo_size == size &&
3025 object->reusable_page_count == 0) {
3026 all_reusable = TRUE;
3027 reusable_page = FALSE;
3028 }
3029 #endif
3030
3031 if ((reusable_page || all_reusable) && object->all_reusable) {
3032 /* This means MADV_FREE_REUSABLE has been called twice, which
3033 * is probably illegal. */
3034 return;
3035 }
3036
3037 pmap_flush_context_init(&pmap_flush_context_storage);
3038
3039 while (size) {
3040 length = deactivate_a_chunk(object, offset, size, kill_page, reusable_page, all_reusable, &pmap_flush_context_storage);
3041
3042 size -= length;
3043 offset += length;
3044 }
3045 pmap_flush(&pmap_flush_context_storage);
3046
3047 if (all_reusable) {
3048 if (!object->all_reusable) {
3049 unsigned int reusable;
3050
3051 object->all_reusable = TRUE;
3052 assert(object->reusable_page_count == 0);
3053 /* update global stats */
3054 reusable = object->resident_page_count;
3055 OSAddAtomic(reusable,
3056 &vm_page_stats_reusable.reusable_count);
3057 vm_page_stats_reusable.reusable += reusable;
3058 vm_page_stats_reusable.all_reusable_calls++;
3059 }
3060 } else if (reusable_page) {
3061 vm_page_stats_reusable.partial_reusable_calls++;
3062 }
3063 }
3064
3065 void
3066 vm_object_reuse_pages(
3067 vm_object_t object,
3068 vm_object_offset_t start_offset,
3069 vm_object_offset_t end_offset,
3070 boolean_t allow_partial_reuse)
3071 {
3072 vm_object_offset_t cur_offset;
3073 vm_page_t m;
3074 unsigned int reused, reusable;
3075
3076 #define VM_OBJECT_REUSE_PAGE(object, m, reused) \
3077 MACRO_BEGIN \
3078 if ((m) != VM_PAGE_NULL && \
3079 (m)->reusable) { \
3080 assert((object)->reusable_page_count <= \
3081 (object)->resident_page_count); \
3082 assert((object)->reusable_page_count > 0); \
3083 (object)->reusable_page_count--; \
3084 (m)->reusable = FALSE; \
3085 (reused)++; \
3086 /* \
3087 * Tell pmap that this page is no longer \
3088 * "reusable", to update the "reusable" stats \
3089 * for all the pmaps that have mapped this \
3090 * page. \
3091 */ \
3092 pmap_clear_refmod_options((m)->phys_page, \
3093 0, /* refmod */ \
3094 (PMAP_OPTIONS_CLEAR_REUSABLE \
3095 | PMAP_OPTIONS_NOFLUSH), \
3096 NULL); \
3097 } \
3098 MACRO_END
3099
3100 reused = 0;
3101 reusable = 0;
3102
3103 vm_object_lock_assert_exclusive(object);
3104
3105 if (object->all_reusable) {
3106 panic("object %p all_reusable: can't update pmap stats\n",
3107 object);
3108 assert(object->reusable_page_count == 0);
3109 object->all_reusable = FALSE;
3110 if (end_offset - start_offset == object->vo_size ||
3111 !allow_partial_reuse) {
3112 vm_page_stats_reusable.all_reuse_calls++;
3113 reused = object->resident_page_count;
3114 } else {
3115 vm_page_stats_reusable.partial_reuse_calls++;
3116 queue_iterate(&object->memq, m, vm_page_t, listq) {
3117 if (m->offset < start_offset ||
3118 m->offset >= end_offset) {
3119 m->reusable = TRUE;
3120 object->reusable_page_count++;
3121 assert(object->resident_page_count >= object->reusable_page_count);
3122 continue;
3123 } else {
3124 assert(!m->reusable);
3125 reused++;
3126 }
3127 }
3128 }
3129 } else if (object->resident_page_count >
3130 ((end_offset - start_offset) >> PAGE_SHIFT)) {
3131 vm_page_stats_reusable.partial_reuse_calls++;
3132 for (cur_offset = start_offset;
3133 cur_offset < end_offset;
3134 cur_offset += PAGE_SIZE_64) {
3135 if (object->reusable_page_count == 0) {
3136 break;
3137 }
3138 m = vm_page_lookup(object, cur_offset);
3139 VM_OBJECT_REUSE_PAGE(object, m, reused);
3140 }
3141 } else {
3142 vm_page_stats_reusable.partial_reuse_calls++;
3143 queue_iterate(&object->memq, m, vm_page_t, listq) {
3144 if (object->reusable_page_count == 0) {
3145 break;
3146 }
3147 if (m->offset < start_offset ||
3148 m->offset >= end_offset) {
3149 continue;
3150 }
3151 VM_OBJECT_REUSE_PAGE(object, m, reused);
3152 }
3153 }
3154
3155 /* update global stats */
3156 OSAddAtomic(reusable-reused, &vm_page_stats_reusable.reusable_count);
3157 vm_page_stats_reusable.reused += reused;
3158 vm_page_stats_reusable.reusable += reusable;
3159 }
3160
3161 /*
3162 * Routine: vm_object_pmap_protect
3163 *
3164 * Purpose:
3165 * Reduces the permission for all physical
3166 * pages in the specified object range.
3167 *
3168 * If removing write permission only, it is
3169 * sufficient to protect only the pages in
3170 * the top-level object; only those pages may
3171 * have write permission.
3172 *
3173 * If removing all access, we must follow the
3174 * shadow chain from the top-level object to
3175 * remove access to all pages in shadowed objects.
3176 *
3177 * The object must *not* be locked. The object must
3178 * be temporary/internal.
3179 *
3180 * If pmap is not NULL, this routine assumes that
3181 * the only mappings for the pages are in that
3182 * pmap.
3183 */
3184
3185 __private_extern__ void
3186 vm_object_pmap_protect(
3187 register vm_object_t object,
3188 register vm_object_offset_t offset,
3189 vm_object_size_t size,
3190 pmap_t pmap,
3191 vm_map_offset_t pmap_start,
3192 vm_prot_t prot)
3193 {
3194 vm_object_pmap_protect_options(object, offset, size,
3195 pmap, pmap_start, prot, 0);
3196 }
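/*
 * Illustrative sketch (not part of the original source): a hypothetical
 * caller using the interface documented above to revoke write access on
 * a range of an internal object.  Passing PMAP_NULL means the pages may
 * be mapped in several pmaps, so the routine protects each physical page
 * rather than a single virtual range; the object must be unlocked here.
 */
#if 0	/* example only -- not compiled */
static void
example_write_protect_object(vm_object_t object, vm_object_size_t size)
{
	vm_object_pmap_protect(object,
			       0,		/* offset within the object */
			       size,		/* length of the range */
			       PMAP_NULL,	/* no single pmap to target */
			       0,		/* pmap_start: unused here */
			       VM_PROT_READ);	/* keep read, revoke write */
}
#endif	/* example only */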
3197
3198 __private_extern__ void
3199 vm_object_pmap_protect_options(
3200 register vm_object_t object,
3201 register vm_object_offset_t offset,
3202 vm_object_size_t size,
3203 pmap_t pmap,
3204 vm_map_offset_t pmap_start,
3205 vm_prot_t prot,
3206 int options)
3207 {
3208 pmap_flush_context pmap_flush_context_storage;
3209 boolean_t delayed_pmap_flush = FALSE;
3210
3211 if (object == VM_OBJECT_NULL)
3212 return;
3213 size = vm_object_round_page(size);
3214 offset = vm_object_trunc_page(offset);
3215
3216 vm_object_lock(object);
3217
3218 if (object->phys_contiguous) {
3219 if (pmap != NULL) {
3220 vm_object_unlock(object);
3221 pmap_protect_options(pmap,
3222 pmap_start,
3223 pmap_start + size,
3224 prot,
3225 options & ~PMAP_OPTIONS_NOFLUSH,
3226 NULL);
3227 } else {
3228 vm_object_offset_t phys_start, phys_end, phys_addr;
3229
3230 phys_start = object->vo_shadow_offset + offset;
3231 phys_end = phys_start + size;
3232 assert(phys_start <= phys_end);
3233 assert(phys_end <= object->vo_shadow_offset + object->vo_size);
3234 vm_object_unlock(object);
3235
3236 pmap_flush_context_init(&pmap_flush_context_storage);
3237 delayed_pmap_flush = FALSE;
3238
3239 for (phys_addr = phys_start;
3240 phys_addr < phys_end;
3241 phys_addr += PAGE_SIZE_64) {
3242 pmap_page_protect_options(
3243 (ppnum_t) (phys_addr >> PAGE_SHIFT),
3244 prot,
3245 options | PMAP_OPTIONS_NOFLUSH,
3246 (void *)&pmap_flush_context_storage);
3247 delayed_pmap_flush = TRUE;
3248 }
3249 if (delayed_pmap_flush == TRUE)
3250 pmap_flush(&pmap_flush_context_storage);
3251 }
3252 return;
3253 }
3254
3255 assert(object->internal);
3256
3257 while (TRUE) {
3258 if (ptoa_64(object->resident_page_count) > size/2 && pmap != PMAP_NULL) {
3259 vm_object_unlock(object);
3260 pmap_protect_options(pmap, pmap_start, pmap_start + size, prot,
3261 options & ~PMAP_OPTIONS_NOFLUSH, NULL);
3262 return;
3263 }
3264
3265 pmap_flush_context_init(&pmap_flush_context_storage);
3266 delayed_pmap_flush = FALSE;
3267
3268 /*
3269 * if the range is large relative to the resident page
3270 * count, iterate over the object's resident pages; otherwise an
3271 * inverse (per-offset) page look-up will be faster
3272 */
3273 if (ptoa_64(object->resident_page_count / 4) < size) {
3274 vm_page_t p;
3275 vm_object_offset_t end;
3276
3277 end = offset + size;
3278
3279 queue_iterate(&object->memq, p, vm_page_t, listq) {
3280 if (!p->fictitious && (offset <= p->offset) && (p->offset < end)) {
3281 vm_map_offset_t start;
3282
3283 start = pmap_start + p->offset - offset;
3284
3285 if (pmap != PMAP_NULL)
3286 pmap_protect_options(
3287 pmap,
3288 start,
3289 start + PAGE_SIZE_64,
3290 prot,
3291 options | PMAP_OPTIONS_NOFLUSH,
3292 &pmap_flush_context_storage);
3293 else
3294 pmap_page_protect_options(
3295 p->phys_page,
3296 prot,
3297 options | PMAP_OPTIONS_NOFLUSH,
3298 &pmap_flush_context_storage);
3299 delayed_pmap_flush = TRUE;
3300 }
3301 }
3302
3303 } else {
3304 vm_page_t p;
3305 vm_object_offset_t end;
3306 vm_object_offset_t target_off;
3307
3308 end = offset + size;
3309
3310 for (target_off = offset;
3311 target_off < end; target_off += PAGE_SIZE) {
3312
3313 p = vm_page_lookup(object, target_off);
3314
3315 if (p != VM_PAGE_NULL) {
3316 vm_object_offset_t start;
3317
3318 start = pmap_start + (p->offset - offset);
3319
3320 if (pmap != PMAP_NULL)
3321 pmap_protect_options(
3322 pmap,
3323 start,
3324 start + PAGE_SIZE_64,
3325 prot,
3326 options | PMAP_OPTIONS_NOFLUSH,
3327 &pmap_flush_context_storage);
3328 else
3329 pmap_page_protect_options(
3330 p->phys_page,
3331 prot,
3332 options | PMAP_OPTIONS_NOFLUSH,
3333 &pmap_flush_context_storage);
3334 delayed_pmap_flush = TRUE;
3335 }
3336 }
3337 }
3338 if (delayed_pmap_flush == TRUE)
3339 pmap_flush(&pmap_flush_context_storage);
3340
3341 if (prot == VM_PROT_NONE) {
3342 /*
3343 * Must follow shadow chain to remove access
3344 * to pages in shadowed objects.
3345 */
3346 register vm_object_t next_object;
3347
3348 next_object = object->shadow;
3349 if (next_object != VM_OBJECT_NULL) {
3350 offset += object->vo_shadow_offset;
3351 vm_object_lock(next_object);
3352 vm_object_unlock(object);
3353 object = next_object;
3354 }
3355 else {
3356 /*
3357 * End of chain - we are done.
3358 */
3359 break;
3360 }
3361 }
3362 else {
3363 /*
3364 * Pages in shadowed objects may never have
3365 * write permission - we may stop here.
3366 */
3367 break;
3368 }
3369 }
3370
3371 vm_object_unlock(object);
3372 }
3373
3374 /*
3375 * Routine: vm_object_copy_slowly
3376 *
3377 * Description:
3378 * Copy the specified range of the source
3379 * virtual memory object without using
3380 * protection-based optimizations (such
3381 * as copy-on-write). The pages in the
3382 * region are actually copied.
3383 *
3384 * In/out conditions:
3385 * The caller must hold a reference and a lock
3386 * for the source virtual memory object. The source
3387 * object will be returned *unlocked*.
3388 *
3389 * Results:
3390 * If the copy is completed successfully, KERN_SUCCESS is
3391 * returned. If the caller asserted the interruptible
3392 * argument, and an interruption occurred while waiting
3393 * for a user-generated event, MACH_SEND_INTERRUPTED is
3394 * returned. Other values may be returned to indicate
3395 * hard errors during the copy operation.
3396 *
3397 * A new virtual memory object is returned in a
3398 * parameter (_result_object). The contents of this
3399 * new object, starting at a zero offset, are a copy
3400 * of the source memory region. In the event of
3401 * an error, this parameter will contain the value
3402 * VM_OBJECT_NULL.
3403 */
3404 __private_extern__ kern_return_t
3405 vm_object_copy_slowly(
3406 register vm_object_t src_object,
3407 vm_object_offset_t src_offset,
3408 vm_object_size_t size,
3409 boolean_t interruptible,
3410 vm_object_t *_result_object) /* OUT */
3411 {
3412 vm_object_t new_object;
3413 vm_object_offset_t new_offset;
3414
3415 struct vm_object_fault_info fault_info;
3416
3417 XPR(XPR_VM_OBJECT, "v_o_c_slowly obj 0x%x off 0x%x size 0x%x\n",
3418 src_object, src_offset, size, 0, 0);
3419
3420 if (size == 0) {
3421 vm_object_unlock(src_object);
3422 *_result_object = VM_OBJECT_NULL;
3423 return(KERN_INVALID_ARGUMENT);
3424 }
3425
3426 /*
3427 * Prevent destruction of the source object while we copy.
3428 */
3429
3430 vm_object_reference_locked(src_object);
3431 vm_object_unlock(src_object);
3432
3433 /*
3434 * Create a new object to hold the copied pages.
3435 * A few notes:
3436 * We fill the new object starting at offset 0,
3437 * regardless of the input offset.
3438 * We don't bother to lock the new object within
3439 * this routine, since we have the only reference.
3440 */
3441
3442 new_object = vm_object_allocate(size);
3443 new_offset = 0;
3444
3445 assert(size == trunc_page_64(size)); /* Will the loop terminate? */
3446
3447 fault_info.interruptible = interruptible;
3448 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
3449 fault_info.user_tag = 0;
3450 fault_info.pmap_options = 0;
3451 fault_info.lo_offset = src_offset;
3452 fault_info.hi_offset = src_offset + size;
3453 fault_info.no_cache = FALSE;
3454 fault_info.stealth = TRUE;
3455 fault_info.io_sync = FALSE;
3456 fault_info.cs_bypass = FALSE;
3457 fault_info.mark_zf_absent = FALSE;
3458 fault_info.batch_pmap_op = FALSE;
3459
3460 for ( ;
3461 size != 0 ;
3462 src_offset += PAGE_SIZE_64,
3463 new_offset += PAGE_SIZE_64, size -= PAGE_SIZE_64
3464 ) {
3465 vm_page_t new_page;
3466 vm_fault_return_t result;
3467
3468 vm_object_lock(new_object);
3469
3470 while ((new_page = vm_page_alloc(new_object, new_offset))
3471 == VM_PAGE_NULL) {
3472
3473 vm_object_unlock(new_object);
3474
3475 if (!vm_page_wait(interruptible)) {
3476 vm_object_deallocate(new_object);
3477 vm_object_deallocate(src_object);
3478 *_result_object = VM_OBJECT_NULL;
3479 return(MACH_SEND_INTERRUPTED);
3480 }
3481 vm_object_lock(new_object);
3482 }
3483 vm_object_unlock(new_object);
3484
3485 do {
3486 vm_prot_t prot = VM_PROT_READ;
3487 vm_page_t _result_page;
3488 vm_page_t top_page;
3489 register
3490 vm_page_t result_page;
3491 kern_return_t error_code;
3492
3493 vm_object_lock(src_object);
3494 vm_object_paging_begin(src_object);
3495
3496 if (size > (vm_size_t) -1) {
3497 /* 32-bit overflow */
3498 fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
3499 } else {
3500 fault_info.cluster_size = (vm_size_t) size;
3501 assert(fault_info.cluster_size == size);
3502 }
3503
3504 XPR(XPR_VM_FAULT,"vm_object_copy_slowly -> vm_fault_page",0,0,0,0,0);
3505 _result_page = VM_PAGE_NULL;
3506 result = vm_fault_page(src_object, src_offset,
3507 VM_PROT_READ, FALSE,
3508 FALSE, /* page not looked up */
3509 &prot, &_result_page, &top_page,
3510 (int *)0,
3511 &error_code, FALSE, FALSE, &fault_info);
3512
3513 switch(result) {
3514 case VM_FAULT_SUCCESS:
3515 result_page = _result_page;
3516
3517 /*
3518 * Copy the page to the new object.
3519 *
3520 * POLICY DECISION:
3521 * If result_page is clean,
3522 * we could steal it instead
3523 * of copying.
3524 */
3525
3526 vm_page_copy(result_page, new_page);
3527 vm_object_unlock(result_page->object);
3528
3529 /*
3530 * Let go of both pages (make them
3531 * not busy, perform wakeup, activate).
3532 */
3533 vm_object_lock(new_object);
3534 SET_PAGE_DIRTY(new_page, FALSE);
3535 PAGE_WAKEUP_DONE(new_page);
3536 vm_object_unlock(new_object);
3537
3538 vm_object_lock(result_page->object);
3539 PAGE_WAKEUP_DONE(result_page);
3540
3541 vm_page_lockspin_queues();
3542 if (!result_page->active &&
3543 !result_page->inactive &&
3544 !result_page->throttled)
3545 vm_page_activate(result_page);
3546 vm_page_activate(new_page);
3547 vm_page_unlock_queues();
3548
3549 /*
3550 * Release paging references and
3551 * top-level placeholder page, if any.
3552 */
3553
3554 vm_fault_cleanup(result_page->object,
3555 top_page);
3556
3557 break;
3558
3559 case VM_FAULT_RETRY:
3560 break;
3561
3562 case VM_FAULT_MEMORY_SHORTAGE:
3563 if (vm_page_wait(interruptible))
3564 break;
3565 /* fall thru */
3566
3567 case VM_FAULT_INTERRUPTED:
3568 vm_object_lock(new_object);
3569 VM_PAGE_FREE(new_page);
3570 vm_object_unlock(new_object);
3571
3572 vm_object_deallocate(new_object);
3573 vm_object_deallocate(src_object);
3574 *_result_object = VM_OBJECT_NULL;
3575 return(MACH_SEND_INTERRUPTED);
3576
3577 case VM_FAULT_SUCCESS_NO_VM_PAGE:
3578 /* success but no VM page: fail */
3579 vm_object_paging_end(src_object);
3580 vm_object_unlock(src_object);
3581 /*FALLTHROUGH*/
3582 case VM_FAULT_MEMORY_ERROR:
3583 /*
3584 * A policy choice:
3585 * (a) ignore pages that we can't
3586 * copy
3587 * (b) return the null object if
3588 * any page fails [chosen]
3589 */
3590
3591 vm_object_lock(new_object);
3592 VM_PAGE_FREE(new_page);
3593 vm_object_unlock(new_object);
3594
3595 vm_object_deallocate(new_object);
3596 vm_object_deallocate(src_object);
3597 *_result_object = VM_OBJECT_NULL;
3598 return(error_code ? error_code:
3599 KERN_MEMORY_ERROR);
3600
3601 default:
3602 panic("vm_object_copy_slowly: unexpected error"
3603 " 0x%x from vm_fault_page()\n", result);
3604 }
3605 } while (result != VM_FAULT_SUCCESS);
3606 }
3607
3608 /*
3609 * Lose the extra reference, and return our object.
3610 */
3611 vm_object_deallocate(src_object);
3612 *_result_object = new_object;
3613 return(KERN_SUCCESS);
3614 }
3615
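/*
 * Illustrative sketch (not part of the original source): the calling
 * convention for vm_object_copy_slowly() described above -- the caller
 * passes in a referenced and locked source object, the routine returns
 * it unlocked, and the copied pages come back in a brand-new object
 * starting at offset 0.  The function name is hypothetical.
 */
#if 0	/* example only -- not compiled */
static kern_return_t
example_copy_whole_object(vm_object_t src_object, vm_object_t *copy_obj)
{
	kern_return_t	kr;

	/* caller already holds a reference on src_object */
	vm_object_lock(src_object);
	kr = vm_object_copy_slowly(src_object,
				   0,			/* src_offset */
				   src_object->vo_size,	/* copy it all */
				   FALSE,		/* not interruptible */
				   copy_obj);
	/* src_object comes back unlocked; on error *copy_obj is VM_OBJECT_NULL */
	return kr;
}
#endif	/* example only */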
3616 /*
3617 * Routine: vm_object_copy_quickly
3618 *
3619 * Purpose:
3620 * Copy the specified range of the source virtual
3621 * memory object, if it can be done without waiting
3622 * for user-generated events.
3623 *
3624 * Results:
3625 * If the copy is successful, the copy is returned in
3626 * the arguments; otherwise, the arguments are not
3627 * affected.
3628 *
3629 * In/out conditions:
3630 * The object should be unlocked on entry and exit.
3631 */
3632
3633 /*ARGSUSED*/
3634 __private_extern__ boolean_t
3635 vm_object_copy_quickly(
3636 vm_object_t *_object, /* INOUT */
3637 __unused vm_object_offset_t offset, /* IN */
3638 __unused vm_object_size_t size, /* IN */
3639 boolean_t *_src_needs_copy, /* OUT */
3640 boolean_t *_dst_needs_copy) /* OUT */
3641 {
3642 vm_object_t object = *_object;
3643 memory_object_copy_strategy_t copy_strategy;
3644
3645 XPR(XPR_VM_OBJECT, "v_o_c_quickly obj 0x%x off 0x%x size 0x%x\n",
3646 *_object, offset, size, 0, 0);
3647 if (object == VM_OBJECT_NULL) {
3648 *_src_needs_copy = FALSE;
3649 *_dst_needs_copy = FALSE;
3650 return(TRUE);
3651 }
3652
3653 vm_object_lock(object);
3654
3655 copy_strategy = object->copy_strategy;
3656
3657 switch (copy_strategy) {
3658 case MEMORY_OBJECT_COPY_SYMMETRIC:
3659
3660 /*
3661 * Symmetric copy strategy.
3662 * Make another reference to the object.
3663 * Leave object/offset unchanged.
3664 */
3665
3666 vm_object_reference_locked(object);
3667 object->shadowed = TRUE;
3668 vm_object_unlock(object);
3669
3670 /*
3671 * Both source and destination must make
3672 * shadows, and the source must be made
3673 * read-only if not already.
3674 */
3675
3676 *_src_needs_copy = TRUE;
3677 *_dst_needs_copy = TRUE;
3678
3679 break;
3680
3681 case MEMORY_OBJECT_COPY_DELAY:
3682 vm_object_unlock(object);
3683 return(FALSE);
3684
3685 default:
3686 vm_object_unlock(object);
3687 return(FALSE);
3688 }
3689 return(TRUE);
3690 }
3691
3692 static int copy_call_count = 0;
3693 static int copy_call_sleep_count = 0;
3694 static int copy_call_restart_count = 0;
3695
3696 /*
3697 * Routine: vm_object_copy_call [internal]
3698 *
3699 * Description:
3700 * Copy the source object (src_object), using the
3701 * user-managed copy algorithm.
3702 *
3703 * In/out conditions:
3704 * The source object must be locked on entry. It
3705 * will be *unlocked* on exit.
3706 *
3707 * Results:
3708 * If the copy is successful, KERN_SUCCESS is returned.
3709 * A new object that represents the copied virtual
3710 * memory is returned in a parameter (*_result_object).
3711 * If the return value indicates an error, this parameter
3712 * is not valid.
3713 */
3714 static kern_return_t
3715 vm_object_copy_call(
3716 vm_object_t src_object,
3717 vm_object_offset_t src_offset,
3718 vm_object_size_t size,
3719 vm_object_t *_result_object) /* OUT */
3720 {
3721 kern_return_t kr;
3722 vm_object_t copy;
3723 boolean_t check_ready = FALSE;
3724 uint32_t try_failed_count = 0;
3725
3726 /*
3727 * If a copy is already in progress, wait and retry.
3728 *
3729 * XXX
3730 * Consider making this call interruptible, as Mike
3731 * intended it to be.
3732 *
3733 * XXXO
3734 * Need a counter or version or something to allow
3735 * us to use the copy that the currently requesting
3736 * thread is obtaining -- is it worth adding to the
3737 * vm object structure? Depends on how common this case is.
3738 */
3739 copy_call_count++;
3740 while (vm_object_wanted(src_object, VM_OBJECT_EVENT_COPY_CALL)) {
3741 vm_object_sleep(src_object, VM_OBJECT_EVENT_COPY_CALL,
3742 THREAD_UNINT);
3743 copy_call_restart_count++;
3744 }
3745
3746 /*
3747 * Indicate (for the benefit of memory_object_create_copy)
3748 * that we want a copy for src_object. (Note that we cannot
3749 * do a real assert_wait before calling memory_object_copy,
3750 * so we simply set the flag.)
3751 */
3752
3753 vm_object_set_wanted(src_object, VM_OBJECT_EVENT_COPY_CALL);
3754 vm_object_unlock(src_object);
3755
3756 /*
3757 * Ask the memory manager to give us a memory object
3758 * which represents a copy of the src object.
3759 * The memory manager may give us a memory object
3760 * which we already have, or it may give us a
3761 * new memory object. This memory object will arrive
3762 * via memory_object_create_copy.
3763 */
3764
3765 kr = KERN_FAILURE; /* XXX need to change memory_object.defs */
3766 if (kr != KERN_SUCCESS) {
3767 return kr;
3768 }
3769
3770 /*
3771 * Wait for the copy to arrive.
3772 */
3773 vm_object_lock(src_object);
3774 while (vm_object_wanted(src_object, VM_OBJECT_EVENT_COPY_CALL)) {
3775 vm_object_sleep(src_object, VM_OBJECT_EVENT_COPY_CALL,
3776 THREAD_UNINT);
3777 copy_call_sleep_count++;
3778 }
3779 Retry:
3780 assert(src_object->copy != VM_OBJECT_NULL);
3781 copy = src_object->copy;
3782 if (!vm_object_lock_try(copy)) {
3783 vm_object_unlock(src_object);
3784
3785 try_failed_count++;
3786 mutex_pause(try_failed_count); /* wait a bit */
3787
3788 vm_object_lock(src_object);
3789 goto Retry;
3790 }
3791 if (copy->vo_size < src_offset+size)
3792 copy->vo_size = src_offset+size;
3793
3794 if (!copy->pager_ready)
3795 check_ready = TRUE;
3796
3797 /*
3798 * Return the copy.
3799 */
3800 *_result_object = copy;
3801 vm_object_unlock(copy);
3802 vm_object_unlock(src_object);
3803
3804 /* Wait for the copy to be ready. */
3805 if (check_ready == TRUE) {
3806 vm_object_lock(copy);
3807 while (!copy->pager_ready) {
3808 vm_object_sleep(copy, VM_OBJECT_EVENT_PAGER_READY, THREAD_UNINT);
3809 }
3810 vm_object_unlock(copy);
3811 }
3812
3813 return KERN_SUCCESS;
3814 }
3815
3816 static int copy_delayed_lock_collisions = 0;
3817 static int copy_delayed_max_collisions = 0;
3818 static int copy_delayed_lock_contention = 0;
3819 static int copy_delayed_protect_iterate = 0;
3820
3821 /*
3822 * Routine: vm_object_copy_delayed [internal]
3823 *
3824 * Description:
3825 * Copy the specified virtual memory object, using
3826 * the asymmetric copy-on-write algorithm.
3827 *
3828 * In/out conditions:
3829 * The src_object must be locked on entry. It will be unlocked
3830 * on exit - so the caller must also hold a reference to it.
3831 *
3832 * This routine will not block waiting for user-generated
3833 * events. It is not interruptible.
3834 */
3835 __private_extern__ vm_object_t
3836 vm_object_copy_delayed(
3837 vm_object_t src_object,
3838 vm_object_offset_t src_offset,
3839 vm_object_size_t size,
3840 boolean_t src_object_shared)
3841 {
3842 vm_object_t new_copy = VM_OBJECT_NULL;
3843 vm_object_t old_copy;
3844 vm_page_t p;
3845 vm_object_size_t copy_size = src_offset + size;
3846 pmap_flush_context pmap_flush_context_storage;
3847 boolean_t delayed_pmap_flush = FALSE;
3848
3849
3850 int collisions = 0;
3851 /*
3852 * The user-level memory manager wants to see all of the changes
3853 * to this object, but it has promised not to make any changes on
3854 * its own.
3855 *
3856 * Perform an asymmetric copy-on-write, as follows:
3857 * Create a new object, called a "copy object" to hold
3858 * pages modified by the new mapping (i.e., the copy,
3859 * not the original mapping).
3860 * Record the original object as the backing object for
3861 * the copy object. If the original mapping does not
3862 * change a page, it may be used read-only by the copy.
3863 * Record the copy object in the original object.
3864 * When the original mapping causes a page to be modified,
3865 * it must be copied to a new page that is "pushed" to
3866 * the copy object.
3867 * Mark the new mapping (the copy object) copy-on-write.
3868 * This makes the copy object itself read-only, allowing
3869 * it to be reused if the original mapping makes no
3870 * changes, and simplifying the synchronization required
3871 * in the "push" operation described above.
3872 *
3873 * The copy-on-write is said to be asymmetric because the original
3874 * object is *not* marked copy-on-write. A copied page is pushed
3875 * to the copy object, regardless of which party attempted to modify
3876 * the page.
3877 *
3878 * Repeated asymmetric copy operations may be done. If the
3879 * original object has not been changed since the last copy, its
3880 * copy object can be reused. Otherwise, a new copy object can be
3881 * inserted between the original object and its previous copy
3882 * object. Since any copy object is read-only, this cannot affect
3883 * the contents of the previous copy object.
3884 *
3885 * Note that a copy object is higher in the object tree than the
3886 * original object; therefore, use of the copy object recorded in
3887 * the original object must be done carefully, to avoid deadlock.
3888 */
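	/*
	 * As a rough sketch of the resulting object chains: after a
	 * first delayed copy,
	 *
	 *	copy ---shadow---> src_object	(src_object->copy == copy)
	 *
	 * and, if src_object has been modified when it is copied again,
	 * a new copy object is inserted between the two:
	 *
	 *	old_copy ---shadow---> new_copy ---shadow---> src_object
	 *
	 * with src_object->copy now pointing at new_copy.
	 */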
3889
3890 Retry:
3891
3892 /*
3893 * Wait for paging in progress.
3894 */
3895 if (!src_object->true_share &&
3896 (src_object->paging_in_progress != 0 ||
3897 src_object->activity_in_progress != 0)) {
3898 if (src_object_shared == TRUE) {
3899 vm_object_unlock(src_object);
3900 vm_object_lock(src_object);
3901 src_object_shared = FALSE;
3902 goto Retry;
3903 }
3904 vm_object_paging_wait(src_object, THREAD_UNINT);
3905 }
3906 /*
3907 * See whether we can reuse the result of a previous
3908 * copy operation.
3909 */
3910
3911 old_copy = src_object->copy;
3912 if (old_copy != VM_OBJECT_NULL) {
3913 int lock_granted;
3914
3915 /*
3916 * Try to get the locks (out of order)
3917 */
3918 if (src_object_shared == TRUE)
3919 lock_granted = vm_object_lock_try_shared(old_copy);
3920 else
3921 lock_granted = vm_object_lock_try(old_copy);
3922
3923 if (!lock_granted) {
3924 vm_object_unlock(src_object);
3925
3926 if (collisions++ == 0)
3927 copy_delayed_lock_contention++;
3928 mutex_pause(collisions);
3929
3930 /* Heisenberg Rules */
3931 copy_delayed_lock_collisions++;
3932
3933 if (collisions > copy_delayed_max_collisions)
3934 copy_delayed_max_collisions = collisions;
3935
3936 if (src_object_shared == TRUE)
3937 vm_object_lock_shared(src_object);
3938 else
3939 vm_object_lock(src_object);
3940
3941 goto Retry;
3942 }
3943
3944 /*
3945 * Determine whether the old copy object has
3946 * been modified.
3947 */
3948
3949 if (old_copy->resident_page_count == 0 &&
3950 !old_copy->pager_created) {
3951 /*
3952 * It has not been modified.
3953 *
3954 * Return another reference to
3955 * the existing copy-object if
3956 * we can safely grow it (if
3957 * needed).
3958 */
3959
3960 if (old_copy->vo_size < copy_size) {
3961 if (src_object_shared == TRUE) {
3962 vm_object_unlock(old_copy);
3963 vm_object_unlock(src_object);
3964
3965 vm_object_lock(src_object);
3966 src_object_shared = FALSE;
3967 goto Retry;
3968 }
3969 /*
3970 * We can't perform a delayed copy if any of the
3971 * pages in the extended range are wired (because
3972 * we can't safely take write permission away from
3973 * wired pages). If the pages aren't wired, then
3974 * go ahead and protect them.
3975 */
3976 copy_delayed_protect_iterate++;
3977
3978 pmap_flush_context_init(&pmap_flush_context_storage);
3979 delayed_pmap_flush = FALSE;
3980
3981 queue_iterate(&src_object->memq, p, vm_page_t, listq) {
3982 if (!p->fictitious &&
3983 p->offset >= old_copy->vo_size &&
3984 p->offset < copy_size) {
3985 if (VM_PAGE_WIRED(p)) {
3986 vm_object_unlock(old_copy);
3987 vm_object_unlock(src_object);
3988
3989 if (new_copy != VM_OBJECT_NULL) {
3990 vm_object_unlock(new_copy);
3991 vm_object_deallocate(new_copy);
3992 }
3993 if (delayed_pmap_flush == TRUE)
3994 pmap_flush(&pmap_flush_context_storage);
3995
3996 return VM_OBJECT_NULL;
3997 } else {
3998 pmap_page_protect_options(p->phys_page, (VM_PROT_ALL & ~VM_PROT_WRITE),
3999 PMAP_OPTIONS_NOFLUSH, (void *)&pmap_flush_context_storage);
4000 delayed_pmap_flush = TRUE;
4001 }
4002 }
4003 }
4004 if (delayed_pmap_flush == TRUE)
4005 pmap_flush(&pmap_flush_context_storage);
4006
4007 old_copy->vo_size = copy_size;
4008 }
4009 if (src_object_shared == TRUE)
4010 vm_object_reference_shared(old_copy);
4011 else
4012 vm_object_reference_locked(old_copy);
4013 vm_object_unlock(old_copy);
4014 vm_object_unlock(src_object);
4015
4016 if (new_copy != VM_OBJECT_NULL) {
4017 vm_object_unlock(new_copy);
4018 vm_object_deallocate(new_copy);
4019 }
4020 return(old_copy);
4021 }
4022
4023
4024
4025 /*
4026 * Adjust the size argument so that the newly-created
4027 * copy object will be large enough to back either the
4028 * old copy object or the new mapping.
4029 */
4030 if (old_copy->vo_size > copy_size)
4031 copy_size = old_copy->vo_size;
4032
4033 if (new_copy == VM_OBJECT_NULL) {
4034 vm_object_unlock(old_copy);
4035 vm_object_unlock(src_object);
4036 new_copy = vm_object_allocate(copy_size);
4037 vm_object_lock(src_object);
4038 vm_object_lock(new_copy);
4039
4040 src_object_shared = FALSE;
4041 goto Retry;
4042 }
4043 new_copy->vo_size = copy_size;
4044
4045 /*
4046 * The copy-object is always made large enough to
4047 * completely shadow the original object, since
4048 * it may have several users who want to shadow
4049 * the original object at different points.
4050 */
4051
4052 assert((old_copy->shadow == src_object) &&
4053 (old_copy->vo_shadow_offset == (vm_object_offset_t) 0));
4054
4055 } else if (new_copy == VM_OBJECT_NULL) {
4056 vm_object_unlock(src_object);
4057 new_copy = vm_object_allocate(copy_size);
4058 vm_object_lock(src_object);
4059 vm_object_lock(new_copy);
4060
4061 src_object_shared = FALSE;
4062 goto Retry;
4063 }
4064
4065 /*
4066 * We now have the src object locked, and the new copy object
4067 * allocated and locked (and potentially the old copy locked).
4068 * Before we go any further, make sure we can still perform
4069 * a delayed copy, as the situation may have changed.
4070 *
4071 * Specifically, we can't perform a delayed copy if any of the
4072 * pages in the range are wired (because we can't safely take
4073 * write permission away from wired pages). If the pages aren't
4074 * wired, then go ahead and protect them.
4075 */
4076 copy_delayed_protect_iterate++;
4077
4078 pmap_flush_context_init(&pmap_flush_context_storage);
4079 delayed_pmap_flush = FALSE;
4080
4081 queue_iterate(&src_object->memq, p, vm_page_t, listq) {
4082 if (!p->fictitious && p->offset < copy_size) {
4083 if (VM_PAGE_WIRED(p)) {
4084 if (old_copy)
4085 vm_object_unlock(old_copy);
4086 vm_object_unlock(src_object);
4087 vm_object_unlock(new_copy);
4088 vm_object_deallocate(new_copy);
4089
4090 if (delayed_pmap_flush == TRUE)
4091 pmap_flush(&pmap_flush_context_storage);
4092
4093 return VM_OBJECT_NULL;
4094 } else {
4095 pmap_page_protect_options(p->phys_page, (VM_PROT_ALL & ~VM_PROT_WRITE),
4096 PMAP_OPTIONS_NOFLUSH, (void *)&pmap_flush_context_storage);
4097 delayed_pmap_flush = TRUE;
4098 }
4099 }
4100 }
4101 if (delayed_pmap_flush == TRUE)
4102 pmap_flush(&pmap_flush_context_storage);
4103
4104 if (old_copy != VM_OBJECT_NULL) {
4105 /*
4106 * Make the old copy-object shadow the new one.
4107 * It will receive no more pages from the original
4108 * object.
4109 */
4110
4111 /* remove ref. from old_copy */
4112 vm_object_lock_assert_exclusive(src_object);
4113 src_object->ref_count--;
4114 assert(src_object->ref_count > 0);
4115 vm_object_lock_assert_exclusive(old_copy);
4116 old_copy->shadow = new_copy;
4117 vm_object_lock_assert_exclusive(new_copy);
4118 assert(new_copy->ref_count > 0);
4119 new_copy->ref_count++; /* for old_copy->shadow ref. */
4120
4121 #if TASK_SWAPPER
4122 if (old_copy->res_count) {
4123 VM_OBJ_RES_INCR(new_copy);
4124 VM_OBJ_RES_DECR(src_object);
4125 }
4126 #endif
4127
4128 vm_object_unlock(old_copy); /* done with old_copy */
4129 }
4130
4131 /*
4132 * Point the new copy at the existing object.
4133 */
4134 vm_object_lock_assert_exclusive(new_copy);
4135 new_copy->shadow = src_object;
4136 new_copy->vo_shadow_offset = 0;
4137 new_copy->shadowed = TRUE; /* caller must set needs_copy */
4138
4139 vm_object_lock_assert_exclusive(src_object);
4140 vm_object_reference_locked(src_object);
4141 src_object->copy = new_copy;
4142 vm_object_unlock(src_object);
4143 vm_object_unlock(new_copy);
4144
4145 XPR(XPR_VM_OBJECT,
4146 "vm_object_copy_delayed: used copy object %X for source %X\n",
4147 new_copy, src_object, 0, 0, 0);
4148
4149 return new_copy;
4150 }
4151
4152 /*
4153 * Routine: vm_object_copy_strategically
4154 *
4155 * Purpose:
4156 * Perform a copy according to the source object's
4157 * declared strategy. This operation may block,
4158 * and may be interrupted.
4159 */
4160 __private_extern__ kern_return_t
4161 vm_object_copy_strategically(
4162 register vm_object_t src_object,
4163 vm_object_offset_t src_offset,
4164 vm_object_size_t size,
4165 vm_object_t *dst_object, /* OUT */
4166 vm_object_offset_t *dst_offset, /* OUT */
4167 boolean_t *dst_needs_copy) /* OUT */
4168 {
4169 boolean_t result;
4170 boolean_t interruptible = THREAD_ABORTSAFE; /* XXX */
4171 boolean_t object_lock_shared = FALSE;
4172 memory_object_copy_strategy_t copy_strategy;
4173
4174 assert(src_object != VM_OBJECT_NULL);
4175
4176 copy_strategy = src_object->copy_strategy;
4177
4178 if (copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
4179 vm_object_lock_shared(src_object);
4180 object_lock_shared = TRUE;
4181 } else
4182 vm_object_lock(src_object);
4183
4184 /*
4185 * The copy strategy is only valid if the memory manager
4186 * is "ready". Internal objects are always ready.
4187 */
4188
4189 while (!src_object->internal && !src_object->pager_ready) {
4190 wait_result_t wait_result;
4191
4192 if (object_lock_shared == TRUE) {
4193 vm_object_unlock(src_object);
4194 vm_object_lock(src_object);
4195 object_lock_shared = FALSE;
4196 continue;
4197 }
4198 wait_result = vm_object_sleep( src_object,
4199 VM_OBJECT_EVENT_PAGER_READY,
4200 interruptible);
4201 if (wait_result != THREAD_AWAKENED) {
4202 vm_object_unlock(src_object);
4203 *dst_object = VM_OBJECT_NULL;
4204 *dst_offset = 0;
4205 *dst_needs_copy = FALSE;
4206 return(MACH_SEND_INTERRUPTED);
4207 }
4208 }
4209
4210 /*
4211 * Use the appropriate copy strategy.
4212 */
4213
4214 switch (copy_strategy) {
4215 case MEMORY_OBJECT_COPY_DELAY:
4216 *dst_object = vm_object_copy_delayed(src_object,
4217 src_offset, size, object_lock_shared);
4218 if (*dst_object != VM_OBJECT_NULL) {
4219 *dst_offset = src_offset;
4220 *dst_needs_copy = TRUE;
4221 result = KERN_SUCCESS;
4222 break;
4223 }
4224 vm_object_lock(src_object);
4225 /* fall thru when delayed copy not allowed */
4226
4227 case MEMORY_OBJECT_COPY_NONE:
4228 result = vm_object_copy_slowly(src_object, src_offset, size,
4229 interruptible, dst_object);
4230 if (result == KERN_SUCCESS) {
4231 *dst_offset = 0;
4232 *dst_needs_copy = FALSE;
4233 }
4234 break;
4235
4236 case MEMORY_OBJECT_COPY_CALL:
4237 result = vm_object_copy_call(src_object, src_offset, size,
4238 dst_object);
4239 if (result == KERN_SUCCESS) {
4240 *dst_offset = src_offset;
4241 *dst_needs_copy = TRUE;
4242 }
4243 break;
4244
4245 case MEMORY_OBJECT_COPY_SYMMETRIC:
4246 XPR(XPR_VM_OBJECT, "v_o_c_strategically obj 0x%x off 0x%x size 0x%x\n", src_object, src_offset, size, 0, 0);
4247 vm_object_unlock(src_object);
4248 result = KERN_MEMORY_RESTART_COPY;
4249 break;
4250
4251 default:
4252 panic("copy_strategically: bad strategy");
4253 result = KERN_INVALID_ARGUMENT;
4254 }
4255 return(result);
4256 }
4257
4258 /*
4259 * vm_object_shadow:
4260 *
4261 * Create a new object which is backed by the
4262 * specified existing object range. The source
4263 * object reference is deallocated.
4264 *
4265 * The new object and offset into that object
4266 * are returned in the source parameters.
4267 */
4268 boolean_t vm_object_shadow_check = TRUE;
4269
4270 __private_extern__ boolean_t
4271 vm_object_shadow(
4272 vm_object_t *object, /* IN/OUT */
4273 vm_object_offset_t *offset, /* IN/OUT */
4274 vm_object_size_t length)
4275 {
4276 register vm_object_t source;
4277 register vm_object_t result;
4278
4279 source = *object;
4280 assert(source != VM_OBJECT_NULL);
4281 if (source == VM_OBJECT_NULL)
4282 return FALSE;
4283
4284 #if 0
4285 /*
4286 * XXX FBDP
4287 * This assertion is valid but it gets triggered by Rosetta for example
4288 * due to a combination of vm_remap() that changes a VM object's
4289 * copy_strategy from SYMMETRIC to DELAY and vm_protect(VM_PROT_COPY)
4290 * that then sets "needs_copy" on its map entry. This creates a
4291 * mapping situation that VM should never see and doesn't know how to
4292 * handle.
4293 * It's not clear if this can create any real problem but we should
4294 * look into fixing this, probably by having vm_protect(VM_PROT_COPY)
4295 * do more than just set "needs_copy" to handle the copy-on-write...
4296 * In the meantime, let's disable the assertion.
4297 */
4298 assert(source->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4299 #endif
4300
4301 /*
4302 * Determine if we really need a shadow.
4303 *
4304 * If the source object is larger than what we are trying
4305 * to create, then force the shadow creation even if the
4306 * ref count is 1. This will allow us to [potentially]
4307 * collapse the underlying object away in the future
4308 * (freeing up the extra data it might contain and that
4309 * we don't need).
4310 */
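	/*
	 * In short: if we hold the only reference to a source object of
	 * exactly the requested size, and the source's own shadow (if
	 * any) is not the target of a copy, the existing source object
	 * can be used directly and no shadow object is needed.
	 */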
4311 if (vm_object_shadow_check &&
4312 source->vo_size == length &&
4313 source->ref_count == 1 &&
4314 (source->shadow == VM_OBJECT_NULL ||
4315 source->shadow->copy == VM_OBJECT_NULL) )
4316 {
4317 source->shadowed = FALSE;
4318 return FALSE;
4319 }
4320
4321 /*
4322 * Allocate a new object with the given length
4323 */
4324
4325 if ((result = vm_object_allocate(length)) == VM_OBJECT_NULL)
4326 panic("vm_object_shadow: no object for shadowing");
4327
4328 /*
4329 * The new object shadows the source object, adding
4330 * a reference to it. Our caller changes his reference
4331 * to point to the new object, removing a reference to
4332 * the source object. Net result: no change of reference
4333 * count.
4334 */
4335 result->shadow = source;
4336
4337 /*
4338 * Store the offset into the source object,
4339 * and fix up the offset into the new object.
4340 */
4341
4342 result->vo_shadow_offset = *offset;
4343
4344 /*
4345 * Return the new things
4346 */
4347
4348 *offset = 0;
4349 *object = result;
4350 return TRUE;
4351 }
4352
4353 /*
4354 * The relationship between vm_object structures and
4355 * the memory_object requires careful synchronization.
4356 *
4357 * All associations are created by memory_object_create_named
4358 * for external pagers and vm_object_pager_create for internal
4359 * objects as follows:
4360 *
4361 * pager: the memory_object itself, supplied by
4362 * the user requesting a mapping (or the kernel,
4363 * when initializing internal objects); the
4364 * kernel simulates holding send rights by keeping
4365 * a port reference;
4366 *
4367 * pager_request:
4368 * the memory object control port,
4369 * created by the kernel; the kernel holds
4370 * receive (and ownership) rights to this
4371 * port, but no other references.
4372 *
4373 * When initialization is complete, the "initialized" field
4374 * is asserted. Other mappings using a particular memory object,
4375 * and any references to the vm_object gained through the
4376 * port association must wait for this initialization to occur.
4377 *
4378 * In order to allow the memory manager to set attributes before
4379 * requests (notably virtual copy operations, but also data or
4380 * unlock requests) are made, a "ready" attribute is made available.
4381 * Only the memory manager may affect the value of this attribute.
4382 * Its value does not affect critical kernel functions, such as
4383 * internal object initialization or destruction. [Furthermore,
4384 * memory objects created by the kernel are assumed to be ready
4385 * immediately; the default memory manager need not explicitly
4386 * set the "ready" attribute.]
4387 *
4388 * [Both the "initialized" and "ready" attribute wait conditions
4389 * use the "pager" field as the wait event.]
4390 *
4391 * The port associations can be broken down by any of the
4392 * following routines:
4393 * vm_object_terminate:
4394 * No references to the vm_object remain, and
4395 * the object cannot (or will not) be cached.
4396 * This is the normal case, and is done even
4397 * though one of the other cases has already been
4398 * done.
4399 * memory_object_destroy:
4400 * The memory manager has requested that the
4401 * kernel relinquish references to the memory
4402 * object. [The memory manager may not want to
4403 * destroy the memory object, but may wish to
4404 * refuse or tear down existing memory mappings.]
4405 *
4406 * Each routine that breaks an association must break all of
4407 * them at once. At some later time, that routine must clear
4408 * the pager field and release the memory object references.
4409 * [Furthermore, each routine must cope with the simultaneous
4410 * or previous operations of the others.]
4411 *
4412 * In addition to the lock on the object, the vm_object_hash_lock
4413 * governs the associations. References gained through the
4414 * association require use of the hash lock.
4415 *
4416 * Because the pager field may be cleared spontaneously, it
4417 * cannot be used to determine whether a memory object has
4418 * ever been associated with a particular vm_object. [This
4419 * knowledge is important to the shadow object mechanism.]
4420 * For this reason, an additional "created" attribute is
4421 * provided.
4422 *
4423 * During various paging operations, the pager reference found in the
4424 * vm_object must be valid. To prevent this from being released
4425 * (other than being removed, i.e., made null), routines may use
4426 * the vm_object_paging_begin/end routines [actually, macros].
4427 * The implementation uses the "paging_in_progress" and "wanted" fields.
4428 * [Operations that alter the validity of the pager values include the
4429 * termination routines and vm_object_collapse.]
4430 */
4431
4432
4433 /*
4434 * Routine: vm_object_enter
4435 * Purpose:
4436 * Find a VM object corresponding to the given
4437 * pager; if no such object exists, create one,
4438 * and initialize the pager.
4439 */
4440 vm_object_t
4441 vm_object_enter(
4442 memory_object_t pager,
4443 vm_object_size_t size,
4444 boolean_t internal,
4445 boolean_t init,
4446 boolean_t named)
4447 {
4448 register vm_object_t object;
4449 vm_object_t new_object;
4450 boolean_t must_init;
4451 vm_object_hash_entry_t entry, new_entry;
4452 uint32_t try_failed_count = 0;
4453 lck_mtx_t *lck;
4454
4455 if (pager == MEMORY_OBJECT_NULL)
4456 return(vm_object_allocate(size));
4457
4458 new_object = VM_OBJECT_NULL;
4459 new_entry = VM_OBJECT_HASH_ENTRY_NULL;
4460 must_init = init;
4461
4462 /*
4463 * Look for an object associated with this port.
4464 */
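	/*
	 * The lookup loop below handles three cases:
	 *   - no hash entry and no preallocated object: drop the hash
	 *     lock, allocate an entry and an object, and look up again;
	 *   - no hash entry, but we allocated on a previous pass: insert
	 *     our new entry/object and initialize it below;
	 *   - an entry whose object is being terminated: wait for the
	 *     termination message to be queued and look up again.
	 */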
4465 Retry:
4466 lck = vm_object_hash_lock_spin(pager);
4467 do {
4468 entry = vm_object_hash_lookup(pager, FALSE);
4469
4470 if (entry == VM_OBJECT_HASH_ENTRY_NULL) {
4471 if (new_object == VM_OBJECT_NULL) {
4472 /*
4473 * We must unlock to create a new object;
4474 * if we do so, we must try the lookup again.
4475 */
4476 vm_object_hash_unlock(lck);
4477 assert(new_entry == VM_OBJECT_HASH_ENTRY_NULL);
4478 new_entry = vm_object_hash_entry_alloc(pager);
4479 new_object = vm_object_allocate(size);
4480 lck = vm_object_hash_lock_spin(pager);
4481 } else {
4482 /*
4483 * Lookup failed twice, and we have something
4484 * to insert; set the object.
4485 */
4486 vm_object_lock(new_object);
4487 vm_object_hash_insert(new_entry, new_object);
4488 vm_object_unlock(new_object);
4489 entry = new_entry;
4490 new_entry = VM_OBJECT_HASH_ENTRY_NULL;
4491 new_object = VM_OBJECT_NULL;
4492 must_init = TRUE;
4493 }
4494 } else if (entry->object == VM_OBJECT_NULL) {
4495 /*
4496 * If a previous object is being terminated,
4497 * we must wait for the termination message
4498 * to be queued (and lookup the entry again).
4499 */
4500 entry->waiting = TRUE;
4501 entry = VM_OBJECT_HASH_ENTRY_NULL;
4502 assert_wait((event_t) pager, THREAD_UNINT);
4503 vm_object_hash_unlock(lck);
4504
4505 thread_block(THREAD_CONTINUE_NULL);
4506 lck = vm_object_hash_lock_spin(pager);
4507 }
4508 } while (entry == VM_OBJECT_HASH_ENTRY_NULL);
4509
4510 object = entry->object;
4511 assert(object != VM_OBJECT_NULL);
4512
4513 if (!must_init) {
4514 if ( !vm_object_lock_try(object)) {
4515
4516 vm_object_hash_unlock(lck);
4517
4518 try_failed_count++;
4519 mutex_pause(try_failed_count); /* wait a bit */
4520 goto Retry;
4521 }
4522 assert(!internal || object->internal);
4523 #if VM_OBJECT_CACHE
4524 if (object->ref_count == 0) {
4525 if ( !vm_object_cache_lock_try()) {
4526
4527 vm_object_hash_unlock(lck);
4528 vm_object_unlock(object);
4529
4530 try_failed_count++;
4531 mutex_pause(try_failed_count); /* wait a bit */
4532 goto Retry;
4533 }
4534 XPR(XPR_VM_OBJECT_CACHE,
4535 "vm_object_enter: removing %x from cache, head (%x, %x)\n",
4536 object,
4537 vm_object_cached_list.next,
4538 vm_object_cached_list.prev, 0,0);
4539 queue_remove(&vm_object_cached_list, object,
4540 vm_object_t, cached_list);
4541 vm_object_cached_count--;
4542
4543 vm_object_cache_unlock();
4544 }
4545 #endif
4546 if (named) {
4547 assert(!object->named);
4548 object->named = TRUE;
4549 }
4550 vm_object_lock_assert_exclusive(object);
4551 object->ref_count++;
4552 vm_object_res_reference(object);
4553
4554 vm_object_hash_unlock(lck);
4555 vm_object_unlock(object);
4556
4557 VM_STAT_INCR(hits);
4558 } else
4559 vm_object_hash_unlock(lck);
4560
4561 assert(object->ref_count > 0);
4562
4563 VM_STAT_INCR(lookups);
4564
4565 XPR(XPR_VM_OBJECT,
4566 "vm_o_enter: pager 0x%x obj 0x%x must_init %d\n",
4567 pager, object, must_init, 0, 0);
4568
4569 /*
4570 * If we raced to create a vm_object but lost, let's
4571 * throw away ours.
4572 */
4573
4574 if (new_object != VM_OBJECT_NULL)
4575 vm_object_deallocate(new_object);
4576
4577 if (new_entry != VM_OBJECT_HASH_ENTRY_NULL)
4578 vm_object_hash_entry_free(new_entry);
4579
4580 if (must_init) {
4581 memory_object_control_t control;
4582
4583 /*
4584 * Allocate request port.
4585 */
4586
4587 control = memory_object_control_allocate(object);
4588 assert (control != MEMORY_OBJECT_CONTROL_NULL);
4589
4590 vm_object_lock(object);
4591 assert(object != kernel_object);
4592
4593 /*
4594 * Copy the reference we were given.
4595 */
4596
4597 memory_object_reference(pager);
4598 object->pager_created = TRUE;
4599 object->pager = pager;
4600 object->internal = internal;
4601 object->pager_trusted = internal;
4602 if (!internal) {
4603 /* copy strategy invalid until set by memory manager */
4604 object->copy_strategy = MEMORY_OBJECT_COPY_INVALID;
4605 }
4606 object->pager_control = control;
4607 object->pager_ready = FALSE;
4608
4609 vm_object_unlock(object);
4610
4611 /*
4612 * Let the pager know we're using it.
4613 */
4614
4615 (void) memory_object_init(pager,
4616 object->pager_control,
4617 PAGE_SIZE);
4618
4619 vm_object_lock(object);
4620 if (named)
4621 object->named = TRUE;
4622 if (internal) {
4623 object->pager_ready = TRUE;
4624 vm_object_wakeup(object, VM_OBJECT_EVENT_PAGER_READY);
4625 }
4626
4627 object->pager_initialized = TRUE;
4628 vm_object_wakeup(object, VM_OBJECT_EVENT_INITIALIZED);
4629 } else {
4630 vm_object_lock(object);
4631 }
4632
4633 /*
4634 * [At this point, the object must be locked]
4635 */
4636
4637 /*
4638 * Wait for the work above to be done by the first
4639 * thread to map this object.
4640 */
4641
4642 while (!object->pager_initialized) {
4643 vm_object_sleep(object,
4644 VM_OBJECT_EVENT_INITIALIZED,
4645 THREAD_UNINT);
4646 }
4647 vm_object_unlock(object);
4648
4649 XPR(XPR_VM_OBJECT,
4650 "vm_object_enter: vm_object %x, memory_object %x, internal %d\n",
4651 object, object->pager, internal, 0,0);
4652 return(object);
4653 }
4654
4655 /*
4656 * Routine: vm_object_pager_create
4657 * Purpose:
4658 * Create a memory object for an internal object.
4659 * In/out conditions:
4660 * The object is locked on entry and exit;
4661 * it may be unlocked within this call.
4662 * Limitations:
4663 * Only one thread may be performing a
4664 * vm_object_pager_create on an object at
4665 * a time. Presumably, only the pageout
4666 * daemon will be using this routine.
4667 */
4668
4669 void
4670 vm_object_pager_create(
4671 register vm_object_t object)
4672 {
4673 memory_object_t pager;
4674 vm_object_hash_entry_t entry;
4675 lck_mtx_t *lck;
4676 #if MACH_PAGEMAP
4677 vm_object_size_t size;
4678 vm_external_map_t map;
4679 #endif /* MACH_PAGEMAP */
4680
4681 XPR(XPR_VM_OBJECT, "vm_object_pager_create, object 0x%X\n",
4682 object, 0,0,0,0);
4683
4684 assert(object != kernel_object);
4685
4686 if (memory_manager_default_check() != KERN_SUCCESS)
4687 return;
4688
4689 /*
4690 * Prevent collapse or termination by holding a paging reference
4691 */
4692
4693 vm_object_paging_begin(object);
4694 if (object->pager_created) {
4695 /*
4696 * Someone else got to it first...
4697 * wait for them to finish initializing the ports
4698 */
4699 while (!object->pager_initialized) {
4700 vm_object_sleep(object,
4701 VM_OBJECT_EVENT_INITIALIZED,
4702 THREAD_UNINT);
4703 }
4704 vm_object_paging_end(object);
4705 return;
4706 }
4707
4708 /*
4709 * Indicate that a memory object has been assigned
4710 * before dropping the lock, to prevent a race.
4711 */
4712
4713 object->pager_created = TRUE;
4714 object->paging_offset = 0;
4715
4716 #if MACH_PAGEMAP
4717 size = object->vo_size;
4718 #endif /* MACH_PAGEMAP */
4719 vm_object_unlock(object);
4720
4721 #if MACH_PAGEMAP
4722 if (DEFAULT_PAGER_IS_ACTIVE) {
4723 map = vm_external_create(size);
4724 vm_object_lock(object);
4725 assert(object->vo_size == size);
4726 object->existence_map = map;
4727 vm_object_unlock(object);
4728 }
4729 #endif /* MACH_PAGEMAP */
4730
4731 if ((uint32_t) object->vo_size != object->vo_size) {
4732 panic("vm_object_pager_create(): object size 0x%llx >= 4GB\n",
4733 (uint64_t) object->vo_size);
4734 }
4735
4736 /*
4737 * Create the [internal] pager, and associate it with this object.
4738 *
4739 * We make the association here so that vm_object_enter()
4740 * can look up the object to complete initializing it. No
4741 * user will ever map this object.
4742 */
4743 {
4744 memory_object_default_t dmm;
4745
4746 /* acquire a reference for the default memory manager */
4747 dmm = memory_manager_default_reference();
4748
4749 assert(object->temporary);
4750
4751 /* create our new memory object */
4752 assert((vm_size_t) object->vo_size == object->vo_size);
4753 (void) memory_object_create(dmm, (vm_size_t) object->vo_size,
4754 &pager);
4755
4756 memory_object_default_deallocate(dmm);
4757 }
4758
4759 entry = vm_object_hash_entry_alloc(pager);
4760
4761 vm_object_lock(object);
4762 lck = vm_object_hash_lock_spin(pager);
4763 vm_object_hash_insert(entry, object);
4764 vm_object_hash_unlock(lck);
4765 vm_object_unlock(object);
4766
4767 /*
4768 * A reference was returned by
4769 * memory_object_create(), and it is
4770 * copied by vm_object_enter().
4771 */
4772
4773 if (vm_object_enter(pager, object->vo_size, TRUE, TRUE, FALSE) != object)
4774 panic("vm_object_pager_create: mismatch");
4775
4776 /*
4777 * Drop the reference we were passed.
4778 */
4779 memory_object_deallocate(pager);
4780
4781 vm_object_lock(object);
4782
4783 /*
4784 * Release the paging reference
4785 */
4786 vm_object_paging_end(object);
4787 }
4788
4789 void
4790 vm_object_compressor_pager_create(
4791 register vm_object_t object)
4792 {
4793 memory_object_t pager;
4794 vm_object_hash_entry_t entry;
4795 lck_mtx_t *lck;
4796 vm_object_t pager_object = VM_OBJECT_NULL;
4797
4798 assert(object != kernel_object);
4799
4800 /*
4801 * Prevent collapse or termination by holding a paging reference
4802 */
4803
4804 vm_object_paging_begin(object);
4805 if (object->pager_created) {
4806 /*
4807 * Someone else got to it first...
4808 * wait for them to finish initializing the ports
4809 */
4810 while (!object->pager_initialized) {
4811 vm_object_sleep(object,
4812 VM_OBJECT_EVENT_INITIALIZED,
4813 THREAD_UNINT);
4814 }
4815 vm_object_paging_end(object);
4816 return;
4817 }
4818
4819 /*
4820 * Indicate that a memory object has been assigned
4821 * before dropping the lock, to prevent a race.
4822 */
4823
4824 object->pager_created = TRUE;
4825 object->paging_offset = 0;
4826
4827 vm_object_unlock(object);
4828
4829 if ((uint32_t) (object->vo_size/PAGE_SIZE) !=
4830 (object->vo_size/PAGE_SIZE)) {
4831 panic("vm_object_compressor_pager_create(%p): "
4832 "object size 0x%llx >= 0x%llx\n",
4833 object,
4834 (uint64_t) object->vo_size,
4835 0x0FFFFFFFFULL*PAGE_SIZE);
4836 }
4837
4838 /*
4839 * Create the [internal] pager, and associate it with this object.
4840 *
4841 * We make the association here so that vm_object_enter()
4842 * can look up the object to complete initializing it. No
4843 * user will ever map this object.
4844 */
4845 {
4846 assert(object->temporary);
4847
4848 /* create our new memory object */
4849 assert((uint32_t) (object->vo_size/PAGE_SIZE) ==
4850 (object->vo_size/PAGE_SIZE));
4851 (void) compressor_memory_object_create(
4852 (memory_object_size_t) object->vo_size,
4853 &pager);
4854 if (pager == NULL) {
4855 panic("vm_object_compressor_pager_create(): "
4856 "no pager for object %p size 0x%llx\n",
4857 object, (uint64_t) object->vo_size);
4858 }
4859 }
4860
4861 entry = vm_object_hash_entry_alloc(pager);
4862
4863 vm_object_lock(object);
4864 lck = vm_object_hash_lock_spin(pager);
4865 vm_object_hash_insert(entry, object);
4866 vm_object_hash_unlock(lck);
4867 vm_object_unlock(object);
4868
4869 /*
4870 * A reference was returned by
4871 * memory_object_create(), and it is
4872 * copied by vm_object_enter().
4873 */
4874
4875 pager_object = vm_object_enter(pager, object->vo_size, TRUE, TRUE, FALSE);
4876
4877 if (pager_object != object) {
4878 panic("vm_object_compressor_pager_create: mismatch (pager: %p, pager_object: %p, orig_object: %p, orig_object size: 0x%llx)\n", pager, pager_object, object, (uint64_t) object->vo_size);
4879 }
4880
4881 /*
4882 * Drop the reference we were passed.
4883 */
4884 memory_object_deallocate(pager);
4885
4886 vm_object_lock(object);
4887
4888 /*
4889 * Release the paging reference
4890 */
4891 vm_object_paging_end(object);
4892 }
4893
4894 /*
4895 * Routine: vm_object_remove
4896 * Purpose:
4897 * Eliminate the pager/object association
4898 * for this pager.
4899 * Conditions:
4900 * The object cache must be locked.
4901 */
4902 __private_extern__ void
4903 vm_object_remove(
4904 vm_object_t object)
4905 {
4906 memory_object_t pager;
4907
4908 if ((pager = object->pager) != MEMORY_OBJECT_NULL) {
4909 vm_object_hash_entry_t entry;
4910
4911 entry = vm_object_hash_lookup(pager, FALSE);
4912 if (entry != VM_OBJECT_HASH_ENTRY_NULL)
4913 entry->object = VM_OBJECT_NULL;
4914 }
4915
4916 }
4917
4918 /*
4919 * Global variables for vm_object_collapse():
4920 *
4921 * Counts for normal collapses and bypasses.
4922 * Debugging variables, to watch or disable collapse.
4923 */
4924 static long object_collapses = 0;
4925 static long object_bypasses = 0;
4926
4927 static boolean_t vm_object_collapse_allowed = TRUE;
4928 static boolean_t vm_object_bypass_allowed = TRUE;
4929
4930 #if MACH_PAGEMAP
4931 static int vm_external_discarded;
4932 static int vm_external_collapsed;
4933 #endif
4934
4935 unsigned long vm_object_collapse_encrypted = 0;
4936
4937 void vm_object_do_collapse_compressor(vm_object_t object,
4938 vm_object_t backing_object);
4939 void
4940 vm_object_do_collapse_compressor(
4941 vm_object_t object,
4942 vm_object_t backing_object)
4943 {
4944 vm_object_offset_t new_offset, backing_offset;
4945 vm_object_size_t size;
4946
4947 vm_counters.do_collapse_compressor++;
4948
4949 vm_object_lock_assert_exclusive(object);
4950 vm_object_lock_assert_exclusive(backing_object);
4951
4952 size = object->vo_size;
4953
4954 /*
4955 * Move all compressed pages from backing_object
4956 * to the parent.
4957 */
4958
4959 for (backing_offset = object->vo_shadow_offset;
4960 backing_offset < object->vo_shadow_offset + object->vo_size;
4961 backing_offset += PAGE_SIZE) {
4962 memory_object_offset_t backing_pager_offset;
4963
4964 /* find the next compressed page at or after this offset */
4965 backing_pager_offset = (backing_offset +
4966 backing_object->paging_offset);
4967 backing_pager_offset = vm_compressor_pager_next_compressed(
4968 backing_object->pager,
4969 backing_pager_offset);
4970 if (backing_pager_offset == (memory_object_offset_t) -1) {
4971 /* no more compressed pages */
4972 break;
4973 }
4974 backing_offset = (backing_pager_offset -
4975 backing_object->paging_offset);
4976
4977 new_offset = backing_offset - object->vo_shadow_offset;
4978
4979 if (new_offset >= object->vo_size) {
4980 /* we're out of the scope of "object": done */
4981 break;
4982 }
4983
4984 if ((vm_page_lookup(object, new_offset) != VM_PAGE_NULL) ||
4985 (vm_compressor_pager_state_get(object->pager,
4986 (new_offset +
4987 object->paging_offset)) ==
4988 VM_EXTERNAL_STATE_EXISTS)) {
4989 /*
4990 * This page already exists in object, resident or
4991 * compressed.
4992 * We don't need this compressed page in backing_object
4993 * and it will be reclaimed when we release
4994 * backing_object.
4995 */
4996 continue;
4997 }
4998
4999 /*
5000 * backing_object has this page in the VM compressor and
5001 * we need to transfer it to object.
5002 */
5003 vm_counters.do_collapse_compressor_pages++;
5004 vm_compressor_pager_transfer(
5005 /* destination: */
5006 object->pager,
5007 (new_offset + object->paging_offset),
5008 /* source: */
5009 backing_object->pager,
5010 (backing_offset + backing_object->paging_offset));
5011 }
5012 }
5013
5014 /*
5015 * Routine: vm_object_do_collapse
5016 * Purpose:
5017 * Collapse an object with the object backing it.
5018 * Pages in the backing object are moved into the
5019 * parent, and the backing object is deallocated.
5020 * Conditions:
5021 * Both objects and the cache are locked; the page
5022 * queues are unlocked.
5023 *
5024 */
5025 static void
5026 vm_object_do_collapse(
5027 vm_object_t object,
5028 vm_object_t backing_object)
5029 {
5030 vm_page_t p, pp;
5031 vm_object_offset_t new_offset, backing_offset;
5032 vm_object_size_t size;
5033
5034 vm_object_lock_assert_exclusive(object);
5035 vm_object_lock_assert_exclusive(backing_object);
5036
5037 assert(object->purgable == VM_PURGABLE_DENY);
5038 assert(backing_object->purgable == VM_PURGABLE_DENY);
5039
5040 backing_offset = object->vo_shadow_offset;
5041 size = object->vo_size;
5042
5043 /*
5044 * Move all in-memory pages from backing_object
5045 * to the parent. Pages that have been paged out
5046 * will be overwritten by any of the parent's
5047 * pages that shadow them.
5048 */
5049
5050 while (!queue_empty(&backing_object->memq)) {
5051
5052 p = (vm_page_t) queue_first(&backing_object->memq);
5053
5054 new_offset = (p->offset - backing_offset);
5055
5056 assert(!p->busy || p->absent);
5057
5058 /*
5059 * If the parent has a page here, or if
5060 * this page falls outside the parent,
5061 * dispose of it.
5062 *
5063 * Otherwise, move it as planned.
5064 */
5065
5066 if (p->offset < backing_offset || new_offset >= size) {
5067 VM_PAGE_FREE(p);
5068 } else {
5069 /*
5070 * ENCRYPTED SWAP:
5071 * The encryption key includes the "pager" and the
5072 * "paging_offset". These will not change during the
5073 * object collapse, so we can just move an encrypted
5074 * page from one object to the other in this case.
5075 * We can't decrypt the page here, since we can't drop
5076 * the object lock.
5077 */
5078 if (p->encrypted) {
5079 vm_object_collapse_encrypted++;
5080 }
5081 pp = vm_page_lookup(object, new_offset);
5082 if (pp == VM_PAGE_NULL) {
5083
5084 if (VM_COMPRESSOR_PAGER_STATE_GET(object,
5085 new_offset)
5086 == VM_EXTERNAL_STATE_EXISTS) {
5087 /*
5088 * Parent object has this page
5089 * in the VM compressor.
5090 * Throw away the backing
5091 * object's page.
5092 */
5093 VM_PAGE_FREE(p);
5094 } else {
5095 /*
5096 * Parent now has no page.
5097 * Move the backing object's page
5098 * up.
5099 */
5100 vm_page_rename(p, object, new_offset,
5101 TRUE);
5102 }
5103
5104 #if MACH_PAGEMAP
5105 } else if (pp->absent) {
5106
5107 /*
5108 * Parent has an absent page...
5109 * it's not being paged in, so
5110 * it must really be missing from
5111 * the parent.
5112 *
5113 * Throw out the absent page...
5114 * any faults looking for that
5115 * page will restart with the new
5116 * one.
5117 */
5118
5119 VM_PAGE_FREE(pp);
5120 vm_page_rename(p, object, new_offset, TRUE);
5121 #endif /* MACH_PAGEMAP */
5122 } else {
5123 assert(! pp->absent);
5124
5125 /*
5126 * Parent object has a real page.
5127 * Throw away the backing object's
5128 * page.
5129 */
5130 VM_PAGE_FREE(p);
5131 }
5132 }
5133 }
5134
5135 if (vm_object_collapse_compressor_allowed &&
5136 object->pager != MEMORY_OBJECT_NULL &&
5137 backing_object->pager != MEMORY_OBJECT_NULL) {
5138
5139 /* move compressed pages from backing_object to object */
5140 vm_object_do_collapse_compressor(object, backing_object);
5141
5142 } else if (backing_object->pager != MEMORY_OBJECT_NULL) {
5143 vm_object_hash_entry_t entry;
5144
5145 #if !MACH_PAGEMAP
5146 assert((!object->pager_created &&
5147 (object->pager == MEMORY_OBJECT_NULL)) ||
5148 (!backing_object->pager_created &&
5149 (backing_object->pager == MEMORY_OBJECT_NULL)));
5150 #else
5151 assert(!object->pager_created &&
5152 object->pager == MEMORY_OBJECT_NULL);
5153 #endif /* !MACH_PAGEMAP */
5154
5155 /*
5156 * Move the pager from backing_object to object.
5157 *
5158 * XXX We're only using part of the paging space
5159 * for keeps now... we ought to discard the
5160 * unused portion.
5161 */
5162
5163 assert(!object->paging_in_progress);
5164 assert(!object->activity_in_progress);
5165 assert(!object->pager_created);
5166 assert(object->pager == NULL);
5167 object->pager = backing_object->pager;
5168
5169 if (backing_object->hashed) {
5170 lck_mtx_t *lck;
5171
5172 lck = vm_object_hash_lock_spin(backing_object->pager);
5173 entry = vm_object_hash_lookup(object->pager, FALSE);
5174 assert(entry != VM_OBJECT_HASH_ENTRY_NULL);
5175 entry->object = object;
5176 vm_object_hash_unlock(lck);
5177
5178 object->hashed = TRUE;
5179 }
5180 object->pager_created = backing_object->pager_created;
5181 object->pager_control = backing_object->pager_control;
5182 object->pager_ready = backing_object->pager_ready;
5183 object->pager_initialized = backing_object->pager_initialized;
5184 object->paging_offset =
5185 backing_object->paging_offset + backing_offset;
5186 if (object->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
5187 memory_object_control_collapse(object->pager_control,
5188 object);
5189 }
5190 /* the backing_object has lost its pager: reset all fields */
5191 backing_object->pager_created = FALSE;
5192 backing_object->pager_control = NULL;
5193 backing_object->pager_ready = FALSE;
5194 backing_object->paging_offset = 0;
5195 backing_object->pager = NULL;
5196 }
5197
5198 #if MACH_PAGEMAP
5199 /*
5200 * If the shadow offset is 0, use the existence map from
5201 * the backing object if there is one. If the shadow offset is
5202 * not zero, toss it.
5203 *
5204 * XXX - If the shadow offset is not 0 then a bit copy is needed
5205 * if the map is to be salvaged. For now, we just toss the
5206 * old map, giving the collapsed object no map. This means that
5207 * the pager is invoked for zero fill pages. If analysis shows
5208 * that this happens frequently and is a performance hit, then
5209 * this code should be fixed to salvage the map.
5210 */
5211 assert(object->existence_map == VM_EXTERNAL_NULL);
5212 if (backing_offset || (size != backing_object->vo_size)) {
5213 vm_external_discarded++;
5214 vm_external_destroy(backing_object->existence_map,
5215 backing_object->vo_size);
5216 }
5217 else {
5218 vm_external_collapsed++;
5219 object->existence_map = backing_object->existence_map;
5220 }
5221 backing_object->existence_map = VM_EXTERNAL_NULL;
5222 #endif /* MACH_PAGEMAP */
5223
5224 /*
5225 * Object now shadows whatever backing_object did.
5226 * Note that the reference to backing_object->shadow
5227 * moves from within backing_object to within object.
5228 */
5229
5230 assert(!object->phys_contiguous);
5231 assert(!backing_object->phys_contiguous);
5232 object->shadow = backing_object->shadow;
5233 if (object->shadow) {
5234 object->vo_shadow_offset += backing_object->vo_shadow_offset;
5235 /* "backing_object" gave its shadow to "object" */
5236 backing_object->shadow = VM_OBJECT_NULL;
5237 backing_object->vo_shadow_offset = 0;
5238 } else {
5239 /* no shadow, therefore no shadow offset... */
5240 object->vo_shadow_offset = 0;
5241 }
5242 assert((object->shadow == VM_OBJECT_NULL) ||
5243 (object->shadow->copy != backing_object));
5244
5245 /*
5246 * Discard backing_object.
5247 *
5248 * Since the backing object has no pages, no
5249 * pager left, and no object references within it,
5250 * all that is necessary is to dispose of it.
5251 */
5252 object_collapses++;
5253
5254 assert(backing_object->ref_count == 1);
5255 assert(backing_object->resident_page_count == 0);
5256 assert(backing_object->paging_in_progress == 0);
5257 assert(backing_object->activity_in_progress == 0);
5258 assert(backing_object->shadow == VM_OBJECT_NULL);
5259 assert(backing_object->vo_shadow_offset == 0);
5260
5261 if (backing_object->pager != MEMORY_OBJECT_NULL) {
5262 /* ... unless it has a pager; need to terminate pager too */
5263 vm_counters.do_collapse_terminate++;
5264 if (vm_object_terminate(backing_object) != KERN_SUCCESS) {
5265 vm_counters.do_collapse_terminate_failure++;
5266 }
5267 return;
5268 }
5269
5270 assert(backing_object->pager == NULL);
5271
5272 backing_object->alive = FALSE;
5273 vm_object_unlock(backing_object);
5274
5275 XPR(XPR_VM_OBJECT, "vm_object_collapse, collapsed 0x%X\n",
5276 backing_object, 0,0,0,0);
5277
5278 #if VM_OBJECT_TRACKING
5279 if (vm_object_tracking_inited) {
5280 btlog_remove_entries_for_element(vm_object_tracking_btlog,
5281 backing_object);
5282 }
5283 #endif /* VM_OBJECT_TRACKING */
5284
5285 vm_object_lock_destroy(backing_object);
5286
5287 zfree(vm_object_zone, backing_object);
5288
5289 }
5290
5291 static void
5292 vm_object_do_bypass(
5293 vm_object_t object,
5294 vm_object_t backing_object)
5295 {
5296 /*
5297 * Make the parent shadow the next object
5298 * in the chain.
5299 */
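	/*
	 * That is, a chain of
	 *
	 *	object ---shadow---> backing_object ---shadow---> next
	 *
	 * becomes
	 *
	 *	object ---shadow---> next
	 *
	 * and the parent's reference to backing_object is dropped below.
	 */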
5300
5301 vm_object_lock_assert_exclusive(object);
5302 vm_object_lock_assert_exclusive(backing_object);
5303
5304 #if TASK_SWAPPER
5305 /*
5306 * Do object reference in-line to
5307 * conditionally increment shadow's
5308 * residence count. If object is not
5309 * resident, leave residence count
5310 * on shadow alone.
5311 */
5312 if (backing_object->shadow != VM_OBJECT_NULL) {
5313 vm_object_lock(backing_object->shadow);
5314 vm_object_lock_assert_exclusive(backing_object->shadow);
5315 backing_object->shadow->ref_count++;
5316 if (object->res_count != 0)
5317 vm_object_res_reference(backing_object->shadow);
5318 vm_object_unlock(backing_object->shadow);
5319 }
5320 #else /* TASK_SWAPPER */
5321 vm_object_reference(backing_object->shadow);
5322 #endif /* TASK_SWAPPER */
5323
5324 assert(!object->phys_contiguous);
5325 assert(!backing_object->phys_contiguous);
5326 object->shadow = backing_object->shadow;
5327 if (object->shadow) {
5328 object->vo_shadow_offset += backing_object->vo_shadow_offset;
5329 } else {
5330 /* no shadow, therefore no shadow offset... */
5331 object->vo_shadow_offset = 0;
5332 }
5333
5334 /*
5335 * Backing object might have had a copy pointer
5336 * to us. If it did, clear it.
5337 */
5338 if (backing_object->copy == object) {
5339 backing_object->copy = VM_OBJECT_NULL;
5340 }
5341
5342 /*
5343 * Drop the reference count on backing_object.
5344 #if TASK_SWAPPER
5345 * Since its ref_count was at least 2, it
5346 * will not vanish; so we don't need to call
5347 * vm_object_deallocate.
5348 * [with a caveat for "named" objects]
5349 *
5350 * The res_count on the backing object is
5351 * conditionally decremented. It's possible
5352 * (via vm_pageout_scan) to get here with
5353 * a "swapped" object, which has a 0 res_count,
5354 * in which case, the backing object res_count
5355 * is already down by one.
5356 #else
5357 * Don't call vm_object_deallocate unless
5358 * ref_count drops to zero.
5359 *
5360 * The ref_count can drop to zero here if the
5361 * backing object could be bypassed but not
5362 * collapsed, such as when the backing object
5363 * is temporary and cachable.
5364 #endif
5365 */
5366 if (backing_object->ref_count > 2 ||
5367 (!backing_object->named && backing_object->ref_count > 1)) {
5368 vm_object_lock_assert_exclusive(backing_object);
5369 backing_object->ref_count--;
5370 #if TASK_SWAPPER
5371 if (object->res_count != 0)
5372 vm_object_res_deallocate(backing_object);
5373 assert(backing_object->ref_count > 0);
5374 #endif /* TASK_SWAPPER */
5375 vm_object_unlock(backing_object);
5376 } else {
5377
5378 /*
5379 * Drop locks so that we can deallocate
5380 * the backing object.
5381 */
5382
5383 #if TASK_SWAPPER
5384 if (object->res_count == 0) {
5385 /* XXX get a reference for the deallocate below */
5386 vm_object_res_reference(backing_object);
5387 }
5388 #endif /* TASK_SWAPPER */
5389 /*
5390 * vm_object_collapse (the caller of this function) is
5391 * now called from contexts that may not guarantee that a
5392 * valid reference is held on the object... w/o a valid
5393 * reference, it is unsafe and unwise (you will definitely
5394 * regret it) to unlock the object and then retake the lock
5395 * since the object may be terminated and recycled in between.
5396 * The "activity_in_progress" reference will keep the object
5397 * 'stable'.
5398 */
5399 vm_object_activity_begin(object);
5400 vm_object_unlock(object);
5401
5402 vm_object_unlock(backing_object);
5403 vm_object_deallocate(backing_object);
5404
5405 /*
5406 * Relock object. We don't have to reverify
5407 * its state since vm_object_collapse will
5408 * do that for us as it starts at the
5409 * top of its loop.
5410 */
5411
5412 vm_object_lock(object);
5413 vm_object_activity_end(object);
5414 }
5415
5416 object_bypasses++;
5417 }
5418
5419
5420 /*
5421 * vm_object_collapse:
5422 *
5423 * Perform an object collapse or an object bypass if appropriate.
5424 * The real work of collapsing and bypassing is performed in
5425 * the routines vm_object_do_collapse and vm_object_do_bypass.
5426 *
5427 * Requires that the object be locked and the page queues be unlocked.
5428 *
5429 */
5430 static unsigned long vm_object_collapse_calls = 0;
5431 static unsigned long vm_object_collapse_objects = 0;
5432 static unsigned long vm_object_collapse_do_collapse = 0;
5433 static unsigned long vm_object_collapse_do_bypass = 0;
5434
5435 __private_extern__ void
5436 vm_object_collapse(
5437 register vm_object_t object,
5438 register vm_object_offset_t hint_offset,
5439 boolean_t can_bypass)
5440 {
5441 register vm_object_t backing_object;
5442 register unsigned int rcount;
5443 register unsigned int size;
5444 vm_object_t original_object;
5445 int object_lock_type;
5446 int backing_object_lock_type;
5447
5448 vm_object_collapse_calls++;
5449
5450 if (! vm_object_collapse_allowed &&
5451 ! (can_bypass && vm_object_bypass_allowed)) {
5452 return;
5453 }
5454
5455 XPR(XPR_VM_OBJECT, "vm_object_collapse, obj 0x%X\n",
5456 object, 0,0,0,0);
5457
5458 if (object == VM_OBJECT_NULL)
5459 return;
5460
5461 original_object = object;
5462
5463 /*
5464 * The top object was locked "exclusive" by the caller.
5465 * In the first pass, to determine if we can collapse the shadow chain,
5466 * take a "shared" lock on the shadow objects. If we can collapse,
5467 * we'll have to go down the chain again with exclusive locks.
5468 */
5469 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5470 backing_object_lock_type = OBJECT_LOCK_SHARED;
5471
5472 retry:
5473 object = original_object;
5474 vm_object_lock_assert_exclusive(object);
5475
5476 while (TRUE) {
5477 vm_object_collapse_objects++;
5478 /*
5479 * Verify that the conditions are right for either
5480 * collapse or bypass:
5481 */
5482
5483 /*
5484 * There is a backing object, and
5485 */
5486
5487 backing_object = object->shadow;
5488 if (backing_object == VM_OBJECT_NULL) {
5489 if (object != original_object) {
5490 vm_object_unlock(object);
5491 }
5492 return;
5493 }
5494 if (backing_object_lock_type == OBJECT_LOCK_SHARED) {
5495 vm_object_lock_shared(backing_object);
5496 } else {
5497 vm_object_lock(backing_object);
5498 }
5499
5500 /*
5501 * No pages in the object are currently
5502 * being paged out, and
5503 */
5504 if (object->paging_in_progress != 0 ||
5505 object->activity_in_progress != 0) {
5506 /* try and collapse the rest of the shadow chain */
5507 if (object != original_object) {
5508 vm_object_unlock(object);
5509 }
5510 object = backing_object;
5511 object_lock_type = backing_object_lock_type;
5512 continue;
5513 }
5514
5515 /*
5516 * ...
5517 * The backing object is not read_only,
5518 * and no pages in the backing object are
5519 * currently being paged out.
5520 * The backing object is internal.
5521 *
5522 */
5523
5524 if (!backing_object->internal ||
5525 backing_object->paging_in_progress != 0 ||
5526 backing_object->activity_in_progress != 0) {
5527 /* try and collapse the rest of the shadow chain */
5528 if (object != original_object) {
5529 vm_object_unlock(object);
5530 }
5531 object = backing_object;
5532 object_lock_type = backing_object_lock_type;
5533 continue;
5534 }
5535
5536 /*
5537 * Purgeable objects are not supposed to engage in
5538 * copy-on-write activities, so should not have
5539 * any shadow objects or be a shadow object to another
5540 * object.
5541 * Collapsing a purgeable object would require some
5542 * updates to the purgeable compressed ledgers.
5543 */
5544 if (object->purgable != VM_PURGABLE_DENY ||
5545 backing_object->purgable != VM_PURGABLE_DENY) {
5546 panic("vm_object_collapse() attempting to collapse "
5547 "purgeable object: %p(%d) %p(%d)\n",
5548 object, object->purgable,
5549 backing_object, backing_object->purgable);
5550 /* try and collapse the rest of the shadow chain */
5551 if (object != original_object) {
5552 vm_object_unlock(object);
5553 }
5554 object = backing_object;
5555 object_lock_type = backing_object_lock_type;
5556 continue;
5557 }
5558
5559 /*
5560 * The backing object can't be a copy-object:
5561 * the shadow_offset for the copy-object must stay
5562 * as 0. Furthermore (for the 'we have all the
5563 * pages' case), if we bypass backing_object and
5564 * just shadow the next object in the chain, old
5565 * pages from that object would then have to be copied
5566 * BOTH into the (former) backing_object and into the
5567 * parent object.
5568 */
5569 if (backing_object->shadow != VM_OBJECT_NULL &&
5570 backing_object->shadow->copy == backing_object) {
5571 /* try and collapse the rest of the shadow chain */
5572 if (object != original_object) {
5573 vm_object_unlock(object);
5574 }
5575 object = backing_object;
5576 object_lock_type = backing_object_lock_type;
5577 continue;
5578 }
5579
5580 /*
5581 * We can now try to either collapse the backing
5582 * object (if the parent is the only reference to
5583 * it) or (perhaps) remove the parent's reference
5584 * to it.
5585 *
5586 * If there is exactly one reference to the backing
5587 * object, we may be able to collapse it into the
5588 * parent.
5589 *
5590 * If MACH_PAGEMAP is defined:
5591 * The parent must not have a pager created for it,
5592 * since collapsing a backing_object dumps new pages
5593 * into the parent that its pager doesn't know about
5594 * (and the collapse code can't merge the existence
5595 * maps).
5596 * Otherwise:
5597 * As long as one of the objects is still not known
5598 * to the pager, we can collapse them.
5599 */
5600 if (backing_object->ref_count == 1 &&
5601 (vm_object_collapse_compressor_allowed ||
5602 !object->pager_created
5603 #if !MACH_PAGEMAP
5604 || (!backing_object->pager_created)
5605 #endif /*!MACH_PAGEMAP */
5606 ) && vm_object_collapse_allowed) {
5607
5608 /*
5609 * We need the exclusive lock on the VM objects.
5610 */
5611 if (backing_object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
5612 /*
5613 * We have an object and its shadow locked
5614 * "shared". We can't just upgrade the locks
5615 * to "exclusive", as some other thread might
5616 * also have these objects locked "shared" and
5617 * attempt to upgrade one or the other to
5618 * "exclusive". The upgrades would block
5619 * forever waiting for the other "shared" locks
5620 * to get released.
5621 * So we have to release the locks and go
5622 * down the shadow chain again (since it could
5623 * have changed) with "exclusive" locking.
5624 */
5625 vm_object_unlock(backing_object);
5626 if (object != original_object)
5627 vm_object_unlock(object);
5628 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5629 backing_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5630 goto retry;
5631 }
5632
5633 XPR(XPR_VM_OBJECT,
5634 "vm_object_collapse: %x to %x, pager %x, pager_control %x\n",
5635 backing_object, object,
5636 backing_object->pager,
5637 backing_object->pager_control, 0);
5638
5639 /*
5640 * Collapse the object with its backing
5641 * object, and try again with the object's
5642 * new backing object.
5643 */
5644
5645 vm_object_do_collapse(object, backing_object);
5646 vm_object_collapse_do_collapse++;
5647 continue;
5648 }
5649
5650 /*
5651 * Collapsing the backing object was not possible
5652 * or permitted, so let's try bypassing it.
5653 */
5654
5655 if (! (can_bypass && vm_object_bypass_allowed)) {
5656 /* try and collapse the rest of the shadow chain */
5657 if (object != original_object) {
5658 vm_object_unlock(object);
5659 }
5660 object = backing_object;
5661 object_lock_type = backing_object_lock_type;
5662 continue;
5663 }
5664
5665
5666 /*
5667 * If the object doesn't have all its pages present,
5668 * we have to make sure no pages in the backing object
5669 * "show through" before bypassing it.
5670 */
5671 size = (unsigned int)atop(object->vo_size);
5672 rcount = object->resident_page_count;
5673
5674 if (rcount != size) {
5675 vm_object_offset_t offset;
5676 vm_object_offset_t backing_offset;
5677 unsigned int backing_rcount;
5678
5679 /*
5680 * If the backing object has a pager but no pagemap,
5681 * then we cannot bypass it, because we don't know
5682 * what pages it has.
5683 */
5684 if (backing_object->pager_created
5685 #if MACH_PAGEMAP
5686 && (backing_object->existence_map == VM_EXTERNAL_NULL)
5687 #endif /* MACH_PAGEMAP */
5688 ) {
5689 /* try and collapse the rest of the shadow chain */
5690 if (object != original_object) {
5691 vm_object_unlock(object);
5692 }
5693 object = backing_object;
5694 object_lock_type = backing_object_lock_type;
5695 continue;
5696 }
5697
5698 /*
5699 * If the object has a pager but no pagemap,
5700 * then we cannot bypass it, because we don't know
5701 * what pages it has.
5702 */
5703 if (object->pager_created
5704 #if MACH_PAGEMAP
5705 && (object->existence_map == VM_EXTERNAL_NULL)
5706 #endif /* MACH_PAGEMAP */
5707 ) {
5708 /* try and collapse the rest of the shadow chain */
5709 if (object != original_object) {
5710 vm_object_unlock(object);
5711 }
5712 object = backing_object;
5713 object_lock_type = backing_object_lock_type;
5714 continue;
5715 }
5716
5717 backing_offset = object->vo_shadow_offset;
5718 backing_rcount = backing_object->resident_page_count;
5719
5720 if ( (int)backing_rcount - (int)(atop(backing_object->vo_size) - size) > (int)rcount) {
5721 /*
5722 * we have enough pages in the backing object to guarantee that
5723 * at least 1 of them must not be covered by a resident page
5724 * in the object we're evaluating, so move on and
5725 * try to collapse the rest of the shadow chain
5726 */
5727 if (object != original_object) {
5728 vm_object_unlock(object);
5729 }
5730 object = backing_object;
5731 object_lock_type = backing_object_lock_type;
5732 continue;
5733 }
5734
5735 /*
5736 * If all of the pages in the backing object are
5737 * shadowed by the parent object, the parent
5738 * object no longer has to shadow the backing
5739 * object; it can shadow the next one in the
5740 * chain.
5741 *
5742 * If the backing object has existence info,
5743 * we must examine its existence info
5744 * as well.
5745 *
5746 */
5747
5748 #if MACH_PAGEMAP
5749 #define EXISTS_IN_OBJECT(obj, off, rc) \
5750 ((vm_external_state_get((obj)->existence_map, \
5751 (vm_offset_t)(off)) \
5752 == VM_EXTERNAL_STATE_EXISTS) || \
5753 (VM_COMPRESSOR_PAGER_STATE_GET((obj), (off)) \
5754 == VM_EXTERNAL_STATE_EXISTS) || \
5755 ((rc) && vm_page_lookup((obj), (off)) != VM_PAGE_NULL && (rc)--))
5756 #else /* MACH_PAGEMAP */
5757 #define EXISTS_IN_OBJECT(obj, off, rc) \
5758 ((VM_COMPRESSOR_PAGER_STATE_GET((obj), (off)) \
5759 == VM_EXTERNAL_STATE_EXISTS) || \
5760 ((rc) && vm_page_lookup((obj), (off)) != VM_PAGE_NULL && (rc)--))
5761 #endif /* MACH_PAGEMAP */
5762
5763 /*
5764 * Check the hint location first
5765 * (since it is often the quickest way out of here).
5766 */
5767 if (object->cow_hint != ~(vm_offset_t)0)
5768 hint_offset = (vm_object_offset_t)object->cow_hint;
5769 else
5770 hint_offset = (hint_offset > 8 * PAGE_SIZE_64) ?
5771 (hint_offset - 8 * PAGE_SIZE_64) : 0;
5772
5773 if (EXISTS_IN_OBJECT(backing_object, hint_offset +
5774 backing_offset, backing_rcount) &&
5775 !EXISTS_IN_OBJECT(object, hint_offset, rcount)) {
5776 /* dependency right at the hint */
5777 object->cow_hint = (vm_offset_t) hint_offset; /* atomic */
5778 /* try and collapse the rest of the shadow chain */
5779 if (object != original_object) {
5780 vm_object_unlock(object);
5781 }
5782 object = backing_object;
5783 object_lock_type = backing_object_lock_type;
5784 continue;
5785 }
5786
5787 /*
5788 * If the object's window onto the backing_object
5789 * is large compared to the number of resident
5790 * pages in the backing object, it makes sense to
5791 * walk the backing_object's resident pages first.
5792 *
5793 * NOTE: Pages may be in both the existence map and/or
5794 * resident, so if we don't find a dependency while
5795 * walking the backing object's resident page list
5796 * directly, and there is an existence map, we'll have
5797 * to run the offset based 2nd pass. Because we may
5798 * have to run both passes, we need to be careful
5799 * not to decrement 'rcount' in the 1st pass
5800 */
5801 if (backing_rcount && backing_rcount < (size / 8)) {
5802 unsigned int rc = rcount;
5803 vm_page_t p;
5804
5805 backing_rcount = backing_object->resident_page_count;
5806 p = (vm_page_t)queue_first(&backing_object->memq);
5807 do {
5808 offset = (p->offset - backing_offset);
5809
5810 if (offset < object->vo_size &&
5811 offset != hint_offset &&
5812 !EXISTS_IN_OBJECT(object, offset, rc)) {
5813 /* found a dependency */
5814 object->cow_hint = (vm_offset_t) offset; /* atomic */
5815
5816 break;
5817 }
5818 p = (vm_page_t) queue_next(&p->listq);
5819
5820 } while (--backing_rcount);
5821 if (backing_rcount != 0 ) {
5822 /* try and collapse the rest of the shadow chain */
5823 if (object != original_object) {
5824 vm_object_unlock(object);
5825 }
5826 object = backing_object;
5827 object_lock_type = backing_object_lock_type;
5828 continue;
5829 }
5830 }
5831
5832 /*
5833 * Walk through the offsets looking for pages in the
5834 * backing object that show through to the object.
5835 */
5836 if (backing_rcount
5837 #if MACH_PAGEMAP
5838 || backing_object->existence_map
5839 #endif /* MACH_PAGEMAP */
5840 ) {
5841 offset = hint_offset;
5842
5843 while((offset =
5844 (offset + PAGE_SIZE_64 < object->vo_size) ?
5845 (offset + PAGE_SIZE_64) : 0) != hint_offset) {
5846
5847 if (EXISTS_IN_OBJECT(backing_object, offset +
5848 backing_offset, backing_rcount) &&
5849 !EXISTS_IN_OBJECT(object, offset, rcount)) {
5850 /* found a dependency */
5851 object->cow_hint = (vm_offset_t) offset; /* atomic */
5852 break;
5853 }
5854 }
5855 if (offset != hint_offset) {
5856 /* try and collapse the rest of the shadow chain */
5857 if (object != original_object) {
5858 vm_object_unlock(object);
5859 }
5860 object = backing_object;
5861 object_lock_type = backing_object_lock_type;
5862 continue;
5863 }
5864 }
5865 }
5866
5867 /*
5868 * We need "exclusive" locks on the 2 VM objects.
5869 */
5870 if (backing_object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
5871 vm_object_unlock(backing_object);
5872 if (object != original_object)
5873 vm_object_unlock(object);
5874 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5875 backing_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5876 goto retry;
5877 }
5878
5879 /* reset the offset hint for any objects deeper in the chain */
5880 object->cow_hint = (vm_offset_t)0;
5881
5882 /*
5883 * All interesting pages in the backing object
5884 * already live in the parent or its pager.
5885 * Thus we can bypass the backing object.
5886 */
5887
5888 vm_object_do_bypass(object, backing_object);
5889 vm_object_collapse_do_bypass++;
5890
5891 /*
5892 * Try again with this object's new backing object.
5893 */
5894
5895 continue;
5896 }
5897
5898 /* NOT REACHED */
5899 /*
5900 if (object != original_object) {
5901 vm_object_unlock(object);
5902 }
5903 */
5904 }
5905
5906 /*
5907 * Routine: vm_object_page_remove: [internal]
5908 * Purpose:
5909 * Removes all physical pages in the specified
5910 * object range from the object's list of pages.
5911 *
5912 * In/out conditions:
5913 * The object must be locked.
5914 * The object must not have paging_in_progress, usually
5915 * guaranteed by not having a pager.
5916 */
5917 unsigned int vm_object_page_remove_lookup = 0;
5918 unsigned int vm_object_page_remove_iterate = 0;
5919
5920 __private_extern__ void
5921 vm_object_page_remove(
5922 register vm_object_t object,
5923 register vm_object_offset_t start,
5924 register vm_object_offset_t end)
5925 {
5926 register vm_page_t p, next;
5927
5928 /*
5929 * One and two page removals are most popular.
5930 * The factor of 16 here is somewhat arbitrary.
5931 * It balances vm_object_lookup vs iteration.
5932 */
5933
5934 if (atop_64(end - start) < (unsigned)object->resident_page_count/16) {
5935 vm_object_page_remove_lookup++;
5936
5937 for (; start < end; start += PAGE_SIZE_64) {
5938 p = vm_page_lookup(object, start);
5939 if (p != VM_PAGE_NULL) {
5940 assert(!p->cleaning && !p->pageout && !p->laundry);
5941 if (!p->fictitious && p->pmapped)
5942 pmap_disconnect(p->phys_page);
5943 VM_PAGE_FREE(p);
5944 }
5945 }
5946 } else {
5947 vm_object_page_remove_iterate++;
5948
5949 p = (vm_page_t) queue_first(&object->memq);
5950 while (!queue_end(&object->memq, (queue_entry_t) p)) {
5951 next = (vm_page_t) queue_next(&p->listq);
5952 if ((start <= p->offset) && (p->offset < end)) {
5953 assert(!p->cleaning && !p->pageout && !p->laundry);
5954 if (!p->fictitious && p->pmapped)
5955 pmap_disconnect(p->phys_page);
5956 VM_PAGE_FREE(p);
5957 }
5958 p = next;
5959 }
5960 }
5961 }
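/*
 * Illustrative sketch (kept under "#if 0", not part of the build): a worked
 * example of the factor-of-16 heuristic above.  The object and the page
 * counts used here are hypothetical.
 */
#if 0
static void
vm_object_page_remove_example(vm_object_t object)
{
	/*
	 * Assume the object has 64 resident pages.  Removing a 2-page
	 * range gives atop_64(end - start) == 2 and 2 < 64/16, so the
	 * per-offset vm_page_lookup() path is taken.  Removing a
	 * 32-page range would fail that test (32 >= 4) and the whole
	 * memq would be walked once instead.
	 */
	vm_object_lock(object);	/* the object must be locked */
	vm_object_page_remove(object,
			      (vm_object_offset_t)0,
			      (vm_object_offset_t)(2 * PAGE_SIZE_64));
	vm_object_unlock(object);
}
#endif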
5962
5963
5964 /*
5965 * Routine: vm_object_coalesce
5966 * Function: Coalesces two objects backing up adjoining
5967 * regions of memory into a single object.
5968 *
5969 * returns TRUE if objects were combined.
5970 *
5971 * NOTE: Only works at the moment if the second object is NULL -
5972 * if it's not, which object do we lock first?
5973 *
5974 * Parameters:
5975 * prev_object First object to coalesce
5976 * prev_offset Offset into prev_object
5977 * next_object Second object to coalesce
5978 * next_offset Offset into next_object
5979 *
5980 * prev_size Size of reference to prev_object
5981 * next_size Size of reference to next_object
5982 *
5983 * Conditions:
5984 * The object(s) must *not* be locked. The map must be locked
5985 * to preserve the reference to the object(s).
5986 */
5987 static int vm_object_coalesce_count = 0;
5988
5989 __private_extern__ boolean_t
5990 vm_object_coalesce(
5991 register vm_object_t prev_object,
5992 vm_object_t next_object,
5993 vm_object_offset_t prev_offset,
5994 __unused vm_object_offset_t next_offset,
5995 vm_object_size_t prev_size,
5996 vm_object_size_t next_size)
5997 {
5998 vm_object_size_t newsize;
5999
6000 #ifdef lint
6001 next_offset++;
6002 #endif /* lint */
6003
6004 if (next_object != VM_OBJECT_NULL) {
6005 return(FALSE);
6006 }
6007
6008 if (prev_object == VM_OBJECT_NULL) {
6009 return(TRUE);
6010 }
6011
6012 XPR(XPR_VM_OBJECT,
6013 "vm_object_coalesce: 0x%X prev_off 0x%X prev_size 0x%X next_size 0x%X\n",
6014 prev_object, prev_offset, prev_size, next_size, 0);
6015
6016 vm_object_lock(prev_object);
6017
6018 /*
6019 * Try to collapse the object first
6020 */
6021 vm_object_collapse(prev_object, prev_offset, TRUE);
6022
6023 /*
6024 * Can't coalesce if pages not mapped to
6025 * prev_entry may be in use in any way:
6026 * . more than one reference
6027 * . paged out
6028 * . shadows another object
6029 * . has a copy elsewhere
6030 * . is purgeable
6031 * . paging references (pages might be in page-list)
6032 */
6033
6034 if ((prev_object->ref_count > 1) ||
6035 prev_object->pager_created ||
6036 (prev_object->shadow != VM_OBJECT_NULL) ||
6037 (prev_object->copy != VM_OBJECT_NULL) ||
6038 (prev_object->true_share != FALSE) ||
6039 (prev_object->purgable != VM_PURGABLE_DENY) ||
6040 (prev_object->paging_in_progress != 0) ||
6041 (prev_object->activity_in_progress != 0)) {
6042 vm_object_unlock(prev_object);
6043 return(FALSE);
6044 }
6045
6046 vm_object_coalesce_count++;
6047
6048 /*
6049 * Remove any pages that may still be in the object from
6050 * a previous deallocation.
6051 */
6052 vm_object_page_remove(prev_object,
6053 prev_offset + prev_size,
6054 prev_offset + prev_size + next_size);
6055
6056 /*
6057 * Extend the object if necessary.
6058 */
6059 newsize = prev_offset + prev_size + next_size;
6060 if (newsize > prev_object->vo_size) {
6061 #if MACH_PAGEMAP
6062 /*
6063 * We cannot extend an object that has existence info,
6064 * since the existence info might then fail to cover
6065 * the entire object.
6066 *
6067 * This assertion must be true because the object
6068 * has no pager, and we only create existence info
6069 * for objects with pagers.
6070 */
6071 assert(prev_object->existence_map == VM_EXTERNAL_NULL);
6072 #endif /* MACH_PAGEMAP */
6073 prev_object->vo_size = newsize;
6074 }
6075
6076 vm_object_unlock(prev_object);
6077 return(TRUE);
6078 }
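/*
 * Illustrative sketch (under "#if 0", not built): how a caller that has just
 * grown an adjoining allocation might use vm_object_coalesce(), following
 * the parameter and locking rules described above.  The function and its
 * callers are hypothetical.
 */
#if 0
static boolean_t
try_extend_region(
	vm_object_t		prev_object,	/* object backing the existing entry */
	vm_object_offset_t	prev_offset,	/* offset of that entry in the object */
	vm_object_size_t	prev_size,	/* size of the existing reference */
	vm_object_size_t	next_size)	/* size of the new, adjoining region */
{
	/*
	 * Only the "second object is NULL" form is supported, so pass
	 * VM_OBJECT_NULL and a zero next_offset.  The map is assumed to
	 * be locked and the object unlocked, per the conditions above.
	 * TRUE means prev_object now also covers the next_size bytes.
	 */
	return vm_object_coalesce(prev_object,
				  VM_OBJECT_NULL,
				  prev_offset,
				  (vm_object_offset_t)0,
				  prev_size,
				  next_size);
}
#endif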
6079
6080 /*
6081 * Attach a set of physical pages to an object, so that they can
6082 * be mapped by mapping the object. Typically used to map IO memory.
6083 *
6084 * The mapping function and its private data are used to obtain the
6085 * physical addresses for each page to be mapped.
6086 */
6087 void
6088 vm_object_page_map(
6089 vm_object_t object,
6090 vm_object_offset_t offset,
6091 vm_object_size_t size,
6092 vm_object_offset_t (*map_fn)(void *map_fn_data,
6093 vm_object_offset_t offset),
6094 void *map_fn_data) /* private to map_fn */
6095 {
6096 int64_t num_pages;
6097 int i;
6098 vm_page_t m;
6099 vm_page_t old_page;
6100 vm_object_offset_t addr;
6101
6102 num_pages = atop_64(size);
6103
6104 for (i = 0; i < num_pages; i++, offset += PAGE_SIZE_64) {
6105
6106 addr = (*map_fn)(map_fn_data, offset);
6107
6108 while ((m = vm_page_grab_fictitious()) == VM_PAGE_NULL)
6109 vm_page_more_fictitious();
6110
6111 vm_object_lock(object);
6112 if ((old_page = vm_page_lookup(object, offset))
6113 != VM_PAGE_NULL)
6114 {
6115 VM_PAGE_FREE(old_page);
6116 }
6117
6118 assert((ppnum_t) addr == addr);
6119 vm_page_init(m, (ppnum_t) addr, FALSE);
6120 /*
6121 * private normally requires lock_queues but since we
6122 * are initializing the page, it's not necessary here
6123 */
6124 m->private = TRUE; /* don't free page */
6125 m->wire_count = 1;
6126 vm_page_insert(m, object, offset);
6127
6128 PAGE_WAKEUP_DONE(m);
6129 vm_object_unlock(object);
6130 }
6131 }
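/*
 * Illustrative sketch (under "#if 0", not built): using the map_fn callback
 * of vm_object_page_map() to attach a physically contiguous, page-aligned
 * device window.  The structure and function names are hypothetical.
 */
#if 0
struct device_window {
	vm_object_offset_t	base_phys;	/* page-aligned physical base */
};

static vm_object_offset_t
device_window_map_fn(void *map_fn_data, vm_object_offset_t offset)
{
	struct device_window *win = (struct device_window *)map_fn_data;

	/* the value returned is used as a page number (ppnum_t) above */
	return atop_64(win->base_phys + offset);
}

static void
device_window_attach(
	vm_object_t		object,
	struct device_window	*win,
	vm_object_size_t	size)
{
	vm_object_page_map(object,
			   (vm_object_offset_t)0,	/* start of the object */
			   size,
			   device_window_map_fn,
			   (void *)win);
}
#endif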
6132
6133 kern_return_t
6134 vm_object_populate_with_private(
6135 vm_object_t object,
6136 vm_object_offset_t offset,
6137 ppnum_t phys_page,
6138 vm_size_t size)
6139 {
6140 ppnum_t base_page;
6141 vm_object_offset_t base_offset;
6142
6143
6144 if (!object->private)
6145 return KERN_FAILURE;
6146
6147 base_page = phys_page;
6148
6149 vm_object_lock(object);
6150
6151 if (!object->phys_contiguous) {
6152 vm_page_t m;
6153
6154 if ((base_offset = trunc_page_64(offset)) != offset) {
6155 vm_object_unlock(object);
6156 return KERN_FAILURE;
6157 }
6158 base_offset += object->paging_offset;
6159
6160 while (size) {
6161 m = vm_page_lookup(object, base_offset);
6162
6163 if (m != VM_PAGE_NULL) {
6164 if (m->fictitious) {
6165 if (m->phys_page != vm_page_guard_addr) {
6166
6167 vm_page_lockspin_queues();
6168 m->private = TRUE;
6169 vm_page_unlock_queues();
6170
6171 m->fictitious = FALSE;
6172 m->phys_page = base_page;
6173 }
6174 } else if (m->phys_page != base_page) {
6175
6176 if ( !m->private) {
6177 /*
6178 * we'd leak a real page... that can't be right
6179 */
6180 panic("vm_object_populate_with_private - %p not private", m);
6181 }
6182 if (m->pmapped) {
6183 /*
6184 * pmap call to clear old mapping
6185 */
6186 pmap_disconnect(m->phys_page);
6187 }
6188 m->phys_page = base_page;
6189 }
6190 if (m->encrypted) {
6191 /*
6192 * we should never see this on a fictitious or private page
6193 */
6194 panic("vm_object_populate_with_private - %p encrypted", m);
6195 }
6196
6197 } else {
6198 while ((m = vm_page_grab_fictitious()) == VM_PAGE_NULL)
6199 vm_page_more_fictitious();
6200
6201 /*
6202 * private normally requires lock_queues but since we
6203 * are initializing the page, it's not necessary here
6204 */
6205 m->private = TRUE;
6206 m->fictitious = FALSE;
6207 m->phys_page = base_page;
6208 m->unusual = TRUE;
6209 m->busy = FALSE;
6210
6211 vm_page_insert(m, object, base_offset);
6212 }
6213 base_page++; /* Go to the next physical page */
6214 base_offset += PAGE_SIZE;
6215 size -= PAGE_SIZE;
6216 }
6217 } else {
6218 /* NOTE: we should check the original settings here */
6219 /* if we have a size > zero a pmap call should be made */
6220 /* to disable the range */
6221
6222 /* pmap_? */
6223
6224 /* shadows on contiguous memory are not allowed */
6225 /* we therefore can use the offset field */
6226 object->vo_shadow_offset = (vm_object_offset_t)phys_page << PAGE_SHIFT;
6227 object->vo_size = size;
6228 }
6229 vm_object_unlock(object);
6230
6231 return KERN_SUCCESS;
6232 }
6233
6234 /*
6235 * memory_object_free_from_cache:
6236 *
6237 * Walk the vm_object cache list, removing and freeing vm_objects
6238 * which are backed by the pager identified by the caller, (pager_ops).
6239 * Remove up to "count" objects, if there are that many available
6240 * in the cache.
6241 *
6242 * Walk the list at most once, return the number of vm_objects
6243 * actually freed.
6244 */
6245
6246 __private_extern__ kern_return_t
6247 memory_object_free_from_cache(
6248 __unused host_t host,
6249 __unused memory_object_pager_ops_t pager_ops,
6250 int *count)
6251 {
6252 #if VM_OBJECT_CACHE
6253 int object_released = 0;
6254
6255 register vm_object_t object = VM_OBJECT_NULL;
6256 vm_object_t shadow;
6257
6258 /*
6259 if(host == HOST_NULL)
6260 return(KERN_INVALID_ARGUMENT);
6261 */
6262
6263 try_again:
6264 vm_object_cache_lock();
6265
6266 queue_iterate(&vm_object_cached_list, object,
6267 vm_object_t, cached_list) {
6268 if (object->pager &&
6269 (pager_ops == object->pager->mo_pager_ops)) {
6270 vm_object_lock(object);
6271 queue_remove(&vm_object_cached_list, object,
6272 vm_object_t, cached_list);
6273 vm_object_cached_count--;
6274
6275 vm_object_cache_unlock();
6276 /*
6277 * Since this object is in the cache, we know
6278 * that it is initialized and has only a pager's
6279 * (implicit) reference. Take a reference to avoid
6280 * recursive deallocations.
6281 */
6282
6283 assert(object->pager_initialized);
6284 assert(object->ref_count == 0);
6285 vm_object_lock_assert_exclusive(object);
6286 object->ref_count++;
6287
6288 /*
6289 * Terminate the object.
6290 * If the object had a shadow, we let
6291 * vm_object_deallocate deallocate it.
6292 * "pageout" objects have a shadow, but
6293 * maintain a "paging reference" rather
6294 * than a normal reference.
6295 * (We are careful here to limit recursion.)
6296 */
6297 shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
6298
6299 if ((vm_object_terminate(object) == KERN_SUCCESS)
6300 && (shadow != VM_OBJECT_NULL)) {
6301 vm_object_deallocate(shadow);
6302 }
6303
6304 if(object_released++ == *count)
6305 return KERN_SUCCESS;
6306 goto try_again;
6307 }
6308 }
6309 vm_object_cache_unlock();
6310 *count = object_released;
6311 #else
6312 *count = 0;
6313 #endif
6314 return KERN_SUCCESS;
6315 }
6316
6317
6318
6319 kern_return_t
6320 memory_object_create_named(
6321 memory_object_t pager,
6322 memory_object_offset_t size,
6323 memory_object_control_t *control)
6324 {
6325 vm_object_t object;
6326 vm_object_hash_entry_t entry;
6327 lck_mtx_t *lck;
6328
6329 *control = MEMORY_OBJECT_CONTROL_NULL;
6330 if (pager == MEMORY_OBJECT_NULL)
6331 return KERN_INVALID_ARGUMENT;
6332
6333 lck = vm_object_hash_lock_spin(pager);
6334 entry = vm_object_hash_lookup(pager, FALSE);
6335
6336 if ((entry != VM_OBJECT_HASH_ENTRY_NULL) &&
6337 (entry->object != VM_OBJECT_NULL)) {
6338 if (entry->object->named == TRUE)
6339 panic("memory_object_create_named: caller already holds the right"); }
6340 vm_object_hash_unlock(lck);
6341
6342 if ((object = vm_object_enter(pager, size, FALSE, FALSE, TRUE)) == VM_OBJECT_NULL) {
6343 return(KERN_INVALID_OBJECT);
6344 }
6345
6346 /* wait for object (if any) to be ready */
6347 if (object != VM_OBJECT_NULL) {
6348 vm_object_lock(object);
6349 object->named = TRUE;
6350 while (!object->pager_ready) {
6351 vm_object_sleep(object,
6352 VM_OBJECT_EVENT_PAGER_READY,
6353 THREAD_UNINT);
6354 }
6355 *control = object->pager_control;
6356 vm_object_unlock(object);
6357 }
6358 return (KERN_SUCCESS);
6359 }
6360
6361
6362 /*
6363 * Routine: memory_object_recover_named [user interface]
6364 * Purpose:
6365 * Attempt to recover a named reference for a VM object.
6366 * VM will verify that the object has not already started
6367 * down the termination path, and if it has, will optionally
6368 * wait for that to finish.
6369 * Returns:
6370 * KERN_SUCCESS - we recovered a named reference on the object
6371 * KERN_FAILURE - we could not recover a reference (object dead)
6372 * KERN_INVALID_ARGUMENT - bad memory object control
6373 */
6374 kern_return_t
6375 memory_object_recover_named(
6376 memory_object_control_t control,
6377 boolean_t wait_on_terminating)
6378 {
6379 vm_object_t object;
6380
6381 object = memory_object_control_to_vm_object(control);
6382 if (object == VM_OBJECT_NULL) {
6383 return (KERN_INVALID_ARGUMENT);
6384 }
6385 restart:
6386 vm_object_lock(object);
6387
6388 if (object->terminating && wait_on_terminating) {
6389 vm_object_wait(object,
6390 VM_OBJECT_EVENT_PAGING_IN_PROGRESS,
6391 THREAD_UNINT);
6392 goto restart;
6393 }
6394
6395 if (!object->alive) {
6396 vm_object_unlock(object);
6397 return KERN_FAILURE;
6398 }
6399
6400 if (object->named == TRUE) {
6401 vm_object_unlock(object);
6402 return KERN_SUCCESS;
6403 }
6404 #if VM_OBJECT_CACHE
6405 if ((object->ref_count == 0) && (!object->terminating)) {
6406 if (!vm_object_cache_lock_try()) {
6407 vm_object_unlock(object);
6408 goto restart;
6409 }
6410 queue_remove(&vm_object_cached_list, object,
6411 vm_object_t, cached_list);
6412 vm_object_cached_count--;
6413 XPR(XPR_VM_OBJECT_CACHE,
6414 "memory_object_recover_named: removing %X, head (%X, %X)\n",
6415 object,
6416 vm_object_cached_list.next,
6417 vm_object_cached_list.prev, 0,0);
6418
6419 vm_object_cache_unlock();
6420 }
6421 #endif
6422 object->named = TRUE;
6423 vm_object_lock_assert_exclusive(object);
6424 object->ref_count++;
6425 vm_object_res_reference(object);
6426 while (!object->pager_ready) {
6427 vm_object_sleep(object,
6428 VM_OBJECT_EVENT_PAGER_READY,
6429 THREAD_UNINT);
6430 }
6431 vm_object_unlock(object);
6432 return (KERN_SUCCESS);
6433 }
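/*
 * Illustrative sketch (under "#if 0", not built): recovering a named
 * reference through a pager's control port and reacting to the return codes
 * documented above.  The caller is hypothetical.
 */
#if 0
static boolean_t
recover_named_example(memory_object_control_t control)
{
	kern_return_t kr;

	/* wait for any termination already in progress to finish */
	kr = memory_object_recover_named(control, TRUE);
	if (kr == KERN_SUCCESS)
		return TRUE;	/* a named reference was recovered */

	/* KERN_FAILURE: object dead; KERN_INVALID_ARGUMENT: bad control */
	return FALSE;
}
#endif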
6434
6435
6436 /*
6437 * vm_object_release_name:
6438 *
6439 * Enforces name semantic on memory_object reference count decrement
6440 * This routine should not be called unless the caller holds a name
6441 * reference gained through the memory_object_create_named.
6442 *
6443 * If the TERMINATE_IDLE flag is set, the call will return if the
6444 * reference count is not 1. i.e. idle with the only remaining reference
6445 * being the name.
6446 * If the decision is made to proceed, the named flag is cleared and
6447 * the reference count is decremented. If the RESPECT_CACHE flag is
6448 * set and the reference count has gone to zero, the memory_object is
6449 * checked to see if it is cacheable; otherwise, when the reference
6450 * count reaches zero, it is simply terminated.
6451 */
6452
6453 __private_extern__ kern_return_t
6454 vm_object_release_name(
6455 vm_object_t object,
6456 int flags)
6457 {
6458 vm_object_t shadow;
6459 boolean_t original_object = TRUE;
6460
6461 while (object != VM_OBJECT_NULL) {
6462
6463 vm_object_lock(object);
6464
6465 assert(object->alive);
6466 if (original_object)
6467 assert(object->named);
6468 assert(object->ref_count > 0);
6469
6470 /*
6471 * We have to wait for initialization before
6472 * destroying or caching the object.
6473 */
6474
6475 if (object->pager_created && !object->pager_initialized) {
6476 assert(!object->can_persist);
6477 vm_object_assert_wait(object,
6478 VM_OBJECT_EVENT_INITIALIZED,
6479 THREAD_UNINT);
6480 vm_object_unlock(object);
6481 thread_block(THREAD_CONTINUE_NULL);
6482 continue;
6483 }
6484
6485 if (((object->ref_count > 1)
6486 && (flags & MEMORY_OBJECT_TERMINATE_IDLE))
6487 || (object->terminating)) {
6488 vm_object_unlock(object);
6489 return KERN_FAILURE;
6490 } else {
6491 if (flags & MEMORY_OBJECT_RELEASE_NO_OP) {
6492 vm_object_unlock(object);
6493 return KERN_SUCCESS;
6494 }
6495 }
6496
6497 if ((flags & MEMORY_OBJECT_RESPECT_CACHE) &&
6498 (object->ref_count == 1)) {
6499 if (original_object)
6500 object->named = FALSE;
6501 vm_object_unlock(object);
6502 /* let vm_object_deallocate push this thing into */
6503 /* the cache, if that is where it is bound */
6504 vm_object_deallocate(object);
6505 return KERN_SUCCESS;
6506 }
6507 VM_OBJ_RES_DECR(object);
6508 shadow = object->pageout?VM_OBJECT_NULL:object->shadow;
6509
6510 if (object->ref_count == 1) {
6511 if (vm_object_terminate(object) != KERN_SUCCESS) {
6512 if (original_object) {
6513 return KERN_FAILURE;
6514 } else {
6515 return KERN_SUCCESS;
6516 }
6517 }
6518 if (shadow != VM_OBJECT_NULL) {
6519 original_object = FALSE;
6520 object = shadow;
6521 continue;
6522 }
6523 return KERN_SUCCESS;
6524 } else {
6525 vm_object_lock_assert_exclusive(object);
6526 object->ref_count--;
6527 assert(object->ref_count > 0);
6528 if(original_object)
6529 object->named = FALSE;
6530 vm_object_unlock(object);
6531 return KERN_SUCCESS;
6532 }
6533 }
6534 /*NOTREACHED*/
6535 assert(0);
6536 return KERN_FAILURE;
6537 }
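/*
 * Illustrative sketch (under "#if 0", not built): giving up a name reference
 * obtained from memory_object_create_named(), using the flag combinations
 * described above.  The caller is hypothetical.
 */
#if 0
static void
release_named_reference_example(vm_object_t named_object)
{
	kern_return_t kr;

	/*
	 * Only drop the name if the object is otherwise idle, and let
	 * it be cached rather than terminated if it is cacheable.
	 */
	kr = vm_object_release_name(named_object,
				    MEMORY_OBJECT_TERMINATE_IDLE |
				    MEMORY_OBJECT_RESPECT_CACHE);
	if (kr == KERN_FAILURE) {
		/* object busy or already terminating; name kept */
	}
}
#endif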
6538
6539
6540 __private_extern__ kern_return_t
6541 vm_object_lock_request(
6542 vm_object_t object,
6543 vm_object_offset_t offset,
6544 vm_object_size_t size,
6545 memory_object_return_t should_return,
6546 int flags,
6547 vm_prot_t prot)
6548 {
6549 __unused boolean_t should_flush;
6550
6551 should_flush = flags & MEMORY_OBJECT_DATA_FLUSH;
6552
6553 XPR(XPR_MEMORY_OBJECT,
6554 "vm_o_lock_request, obj 0x%X off 0x%X size 0x%X flags %X prot %X\n",
6555 object, offset, size,
6556 (((should_return&1)<<1)|should_flush), prot);
6557
6558 /*
6559 * Check for bogus arguments.
6560 */
6561 if (object == VM_OBJECT_NULL)
6562 return (KERN_INVALID_ARGUMENT);
6563
6564 if ((prot & ~VM_PROT_ALL) != 0 && prot != VM_PROT_NO_CHANGE)
6565 return (KERN_INVALID_ARGUMENT);
6566
6567 size = round_page_64(size);
6568
6569 /*
6570 * Lock the object, and acquire a paging reference to
6571 * prevent the memory_object reference from being released.
6572 */
6573 vm_object_lock(object);
6574 vm_object_paging_begin(object);
6575
6576 (void)vm_object_update(object,
6577 offset, size, NULL, NULL, should_return, flags, prot);
6578
6579 vm_object_paging_end(object);
6580 vm_object_unlock(object);
6581
6582 return (KERN_SUCCESS);
6583 }
6584
6585 /*
6586 * Empty a purgeable object by grabbing the physical pages assigned to it and
6587 * putting them on the free queue without writing them to backing store, etc.
6588 * When the pages are next touched they will be demand zero-fill pages. We
6589 * skip pages which are busy, being paged in/out, wired, etc. We do _not_
6590 * skip referenced/dirty pages, pages on the active queue, etc. We're more
6591 * than happy to grab these since this is a purgeable object. We mark the
6592 * object as "empty" after reaping its pages.
6593 *
6594 * On entry the object must be locked and it must be
6595 * purgeable with no delayed copies pending.
6596 */
6597 void
6598 vm_object_purge(vm_object_t object, int flags)
6599 {
6600 vm_object_lock_assert_exclusive(object);
6601
6602 if (object->purgable == VM_PURGABLE_DENY)
6603 return;
6604
6605 assert(object->copy == VM_OBJECT_NULL);
6606 assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6607
6608 /*
6609 * We need to set the object's state to VM_PURGABLE_EMPTY *before*
6610 * reaping its pages. We update vm_page_purgeable_count in bulk
6611 * and we don't want vm_page_remove() to update it again for each
6612 * page we reap later.
6613 *
6614 * For the purgeable ledgers, pages from VOLATILE and EMPTY objects
6615 * are all accounted for in the "volatile" ledgers, so this does not
6616 * make any difference.
6617 * If we transitioned directly from NONVOLATILE to EMPTY,
6618 * vm_page_purgeable_count must have been updated when the object
6619 * was dequeued from its volatile queue and the purgeable ledgers
6620 * must have also been updated accordingly at that time (in
6621 * vm_object_purgable_control()).
6622 */
6623 if (object->purgable == VM_PURGABLE_VOLATILE) {
6624 unsigned int delta;
6625 assert(object->resident_page_count >=
6626 object->wired_page_count);
6627 delta = (object->resident_page_count -
6628 object->wired_page_count);
6629 if (delta != 0) {
6630 assert(vm_page_purgeable_count >=
6631 delta);
6632 OSAddAtomic(-delta,
6633 (SInt32 *)&vm_page_purgeable_count);
6634 }
6635 if (object->wired_page_count != 0) {
6636 assert(vm_page_purgeable_wired_count >=
6637 object->wired_page_count);
6638 OSAddAtomic(-object->wired_page_count,
6639 (SInt32 *)&vm_page_purgeable_wired_count);
6640 }
6641 object->purgable = VM_PURGABLE_EMPTY;
6642 }
6643 assert(object->purgable == VM_PURGABLE_EMPTY);
6644
6645 vm_object_reap_pages(object, REAP_PURGEABLE);
6646
6647 if (object->pager != NULL &&
6648 COMPRESSED_PAGER_IS_ACTIVE) {
6649 unsigned int pgcount;
6650
6651 if (object->activity_in_progress == 0 &&
6652 object->paging_in_progress == 0) {
6653 /*
6654 * Also reap any memory coming from this object
6655 * in the VM compressor.
6656 *
6657 * There are no operations in progress on the VM object
6658 * and no operation can start while we're holding the
6659 * VM object lock, so it's safe to reap the compressed
6660 * pages and update the page counts.
6661 */
6662 pgcount = vm_compressor_pager_get_count(object->pager);
6663 if (pgcount) {
6664 pgcount = vm_compressor_pager_reap_pages(object->pager, flags);
6665 vm_compressor_pager_count(object->pager,
6666 -pgcount,
6667 FALSE, /* shared */
6668 object);
6669 vm_purgeable_compressed_update(object,
6670 -pgcount);
6671 }
6672 if ( !(flags & C_DONT_BLOCK)) {
6673 assert(vm_compressor_pager_get_count(object->pager)
6674 == 0);
6675 }
6676 } else {
6677 /*
6678 * There's some kind of paging activity in progress
6679 * for this object, which could result in a page
6680 * being compressed or decompressed, possibly while
6681 * the VM object is not locked, so it could race
6682 * with us.
6683 *
6684 * We can't really synchronize this without possibly
6685 * causing a deadlock when the compressor needs to
6686 * allocate or free memory while compressing or
6687 * decompressing a page from a purgeable object
6688 * mapped in the kernel_map...
6689 *
6690 * So let's not attempt to purge the compressor
6691 * pager if there's any kind of operation in
6692 * progress on the VM object.
6693 */
6694 }
6695 }
6696
6697 vm_object_lock_assert_exclusive(object);
6698 }
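/*
 * Illustrative sketch (under "#if 0", not built): the entry conditions for
 * vm_object_purge() spelled out by the comment above.  The caller is
 * hypothetical.
 */
#if 0
static void
purge_example(vm_object_t object)
{
	vm_object_lock(object);	/* exclusive lock required */
	if (object->purgable == VM_PURGABLE_VOLATILE ||
	    object->purgable == VM_PURGABLE_EMPTY) {
		/* reaps resident pages and, when safe, compressed pages */
		vm_object_purge(object, 0);
	}
	vm_object_unlock(object);
}
#endif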
6699
6700
6701 /*
6702 * vm_object_purgeable_control() allows the caller to control and investigate the
6703 * state of a purgeable object. A purgeable object is created via a call to
6704 * vm_allocate() with VM_FLAGS_PURGABLE specified. A purgeable object will
6705 * never be coalesced with any other object -- even other purgeable objects --
6706 * and will thus always remain a distinct object. A purgeable object has
6707 * special semantics when its reference count is exactly 1. If its reference
6708 * count is greater than 1, then a purgeable object will behave like a normal
6709 * object and attempts to use this interface will result in an error return
6710 * of KERN_INVALID_ARGUMENT.
6711 *
6712 * A purgeable object may be put into a "volatile" state which will make the
6713 * object's pages eligible for being reclaimed without paging to backing
6714 * store if the system runs low on memory. If the pages in a volatile
6715 * purgeable object are reclaimed, the purgeable object is said to have been
6716 * "emptied." When a purgeable object is emptied the system will reclaim as
6717 * many pages from the object as it can in a convenient manner (pages already
6718 * en route to backing store or busy for other reasons are left as is). When
6719 * a purgeable object is made volatile, its pages will generally be reclaimed
6720 * before other pages in the application's working set. This semantic is
6721 * generally used by applications which can recreate the data in the object
6722 * faster than it can be paged in. One such example might be media assets
6723 * which can be reread from a much faster RAID volume.
6724 *
6725 * A purgeable object may be designated as "non-volatile" which means it will
6726 * behave like all other objects in the system with pages being written to and
6727 * read from backing store as needed to satisfy system memory needs. If the
6728 * object was emptied before the object was made non-volatile, that fact will
6729 * be returned as the old state of the purgeable object (see
6730 * VM_PURGABLE_SET_STATE below). In this case, any pages of the object which
6731 * were reclaimed as part of emptying the object will be refaulted in as
6732 * zero-fill on demand. It is up to the application to note that an object
6733 * was emptied and recreate the object's contents if necessary. When a
6734 * purgeable object is made non-volatile, its pages will generally not be paged
6735 * out to backing store in the immediate future. A purgeable object may also
6736 * be manually emptied.
6737 *
6738 * Finally, the current state (non-volatile, volatile, volatile & empty) of a
6739 * volatile purgeable object may be queried at any time. This information may
6740 * be used as a control input to let the application know when the system is
6741 * experiencing memory pressure and is reclaiming memory.
6742 *
6743 * The specified address may be any address within the purgeable object. If
6744 * the specified address does not represent any object in the target task's
6745 * virtual address space, then KERN_INVALID_ADDRESS will be returned. If the
6746 * object containing the specified address is not a purgeable object, then
6747 * KERN_INVALID_ARGUMENT will be returned. Otherwise, KERN_SUCCESS will be
6748 * returned.
6749 *
6750 * The control parameter may be any one of VM_PURGABLE_SET_STATE or
6751 * VM_PURGABLE_GET_STATE. For VM_PURGABLE_SET_STATE, the in/out parameter
6752 * state is used to set the new state of the purgeable object and return its
6753 * old state. For VM_PURGABLE_GET_STATE, the current state of the purgeable
6754 * object is returned in the parameter state.
6755 *
6756 * The in/out parameter state may be one of VM_PURGABLE_NONVOLATILE,
6757 * VM_PURGABLE_VOLATILE or VM_PURGABLE_EMPTY. These, respectively, represent
6758 * the non-volatile, volatile and volatile/empty states described above.
6759 * Setting the state of a purgeable object to VM_PURGABLE_EMPTY will
6760 * immediately reclaim as many pages in the object as can be conveniently
6761 * collected (some may have already been written to backing store or be
6762 * otherwise busy).
6763 *
6764 * The process of making a purgeable object non-volatile and determining its
6765 * previous state is atomic. Thus, if a purgeable object is made
6766 * VM_PURGABLE_NONVOLATILE and the old state is returned as
6767 * VM_PURGABLE_VOLATILE, then the purgeable object's previous contents are
6768 * completely intact and will remain so until the object is made volatile
6769 * again. If the old state is returned as VM_PURGABLE_EMPTY then the object
6770 * was reclaimed while it was in a volatile state and its previous contents
6771 * have been lost.
6772 */
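/*
 * Illustrative user-level sketch (under "#if 0", not part of this file):
 * the usage pattern described above, driven through the Mach
 * vm_allocate() / vm_purgable_control() interfaces.  The buffer size and
 * the surrounding logic are hypothetical.
 */
#if 0
#include <mach/mach.h>

static void
purgeable_cache_example(void)
{
	vm_address_t	addr = 0;
	vm_size_t	size = 16 * 4096;
	int		state;
	kern_return_t	kr;

	/* create a purgeable region; it starts out non-volatile */
	kr = vm_allocate(mach_task_self(), &addr, size,
			 VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE);
	if (kr != KERN_SUCCESS)
		return;

	/* ... fill the region with data that can be recreated ... */

	/* done for now: let the kernel reclaim the pages under pressure */
	state = VM_PURGABLE_VOLATILE;
	kr = vm_purgable_control(mach_task_self(), addr,
				 VM_PURGABLE_SET_STATE, &state);

	/* later: take the region back and see whether it survived */
	state = VM_PURGABLE_NONVOLATILE;
	kr = vm_purgable_control(mach_task_self(), addr,
				 VM_PURGABLE_SET_STATE, &state);
	if (kr == KERN_SUCCESS && state == VM_PURGABLE_EMPTY) {
		/* the pages were reclaimed; recreate the contents */
	}
}
#endif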
6773 /*
6774 * The object must be locked.
6775 */
6776 kern_return_t
6777 vm_object_purgable_control(
6778 vm_object_t object,
6779 vm_purgable_t control,
6780 int *state)
6781 {
6782 int old_state;
6783 int new_state;
6784
6785 if (object == VM_OBJECT_NULL) {
6786 /*
6787 * Object must already be present or it can't be purgeable.
6788 */
6789 return KERN_INVALID_ARGUMENT;
6790 }
6791
6792 vm_object_lock_assert_exclusive(object);
6793
6794 /*
6795 * Get current state of the purgeable object.
6796 */
6797 old_state = object->purgable;
6798 if (old_state == VM_PURGABLE_DENY)
6799 return KERN_INVALID_ARGUMENT;
6800
6801 /* purgeable can't have delayed copies - now or in the future */
6802 assert(object->copy == VM_OBJECT_NULL);
6803 assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6804
6805 /*
6806 * Execute the desired operation.
6807 */
6808 if (control == VM_PURGABLE_GET_STATE) {
6809 *state = old_state;
6810 return KERN_SUCCESS;
6811 }
6812
6813 if ((*state) & VM_PURGABLE_DEBUG_EMPTY) {
6814 object->volatile_empty = TRUE;
6815 }
6816 if ((*state) & VM_PURGABLE_DEBUG_FAULT) {
6817 object->volatile_fault = TRUE;
6818 }
6819
6820 new_state = *state & VM_PURGABLE_STATE_MASK;
6821 if (new_state == VM_PURGABLE_VOLATILE &&
6822 object->volatile_empty) {
6823 new_state = VM_PURGABLE_EMPTY;
6824 }
6825
6826 switch (new_state) {
6827 case VM_PURGABLE_DENY:
6828 case VM_PURGABLE_NONVOLATILE:
6829 object->purgable = new_state;
6830
6831 if (old_state == VM_PURGABLE_VOLATILE) {
6832 unsigned int delta;
6833
6834 assert(object->resident_page_count >=
6835 object->wired_page_count);
6836 delta = (object->resident_page_count -
6837 object->wired_page_count);
6838
6839 assert(vm_page_purgeable_count >= delta);
6840
6841 if (delta != 0) {
6842 OSAddAtomic(-delta,
6843 (SInt32 *)&vm_page_purgeable_count);
6844 }
6845 if (object->wired_page_count != 0) {
6846 assert(vm_page_purgeable_wired_count >=
6847 object->wired_page_count);
6848 OSAddAtomic(-object->wired_page_count,
6849 (SInt32 *)&vm_page_purgeable_wired_count);
6850 }
6851
6852 vm_page_lock_queues();
6853
6854 /* object should be on a queue */
6855 assert(object->objq.next != NULL &&
6856 object->objq.prev != NULL);
6857 purgeable_q_t queue;
6858
6859 /*
6860 * Move object from its volatile queue to the
6861 * non-volatile queue...
6862 */
6863 queue = vm_purgeable_object_remove(object);
6864 assert(queue);
6865
6866 if (object->purgeable_when_ripe) {
6867 vm_purgeable_token_delete_last(queue);
6868 }
6869 assert(queue->debug_count_objects>=0);
6870
6871 vm_page_unlock_queues();
6872 }
6873 if (old_state == VM_PURGABLE_VOLATILE ||
6874 old_state == VM_PURGABLE_EMPTY) {
6875 /*
6876 * Transfer the object's pages from the volatile to
6877 * non-volatile ledgers.
6878 */
6879 vm_purgeable_accounting(object, VM_PURGABLE_VOLATILE,
6880 FALSE);
6881 }
6882
6883 break;
6884
6885 case VM_PURGABLE_VOLATILE:
6886 if (object->volatile_fault) {
6887 vm_page_t p;
6888 int refmod;
6889
6890 queue_iterate(&object->memq, p, vm_page_t, listq) {
6891 if (p->busy ||
6892 VM_PAGE_WIRED(p) ||
6893 p->fictitious) {
6894 continue;
6895 }
6896 refmod = pmap_disconnect(p->phys_page);
6897 if ((refmod & VM_MEM_MODIFIED) &&
6898 !p->dirty) {
6899 SET_PAGE_DIRTY(p, FALSE);
6900 }
6901 }
6902 }
6903
6904 if (old_state == VM_PURGABLE_EMPTY &&
6905 object->resident_page_count == 0 &&
6906 object->pager == NULL)
6907 break;
6908
6909 purgeable_q_t queue;
6910
6911 /* find the correct queue */
6912 if ((*state&VM_PURGABLE_ORDERING_MASK) == VM_PURGABLE_ORDERING_OBSOLETE)
6913 queue = &purgeable_queues[PURGEABLE_Q_TYPE_OBSOLETE];
6914 else {
6915 if ((*state&VM_PURGABLE_BEHAVIOR_MASK) == VM_PURGABLE_BEHAVIOR_FIFO)
6916 queue = &purgeable_queues[PURGEABLE_Q_TYPE_FIFO];
6917 else
6918 queue = &purgeable_queues[PURGEABLE_Q_TYPE_LIFO];
6919 }
6920
6921 if (old_state == VM_PURGABLE_NONVOLATILE ||
6922 old_state == VM_PURGABLE_EMPTY) {
6923 unsigned int delta;
6924
6925 if ((*state & VM_PURGABLE_NO_AGING_MASK) ==
6926 VM_PURGABLE_NO_AGING) {
6927 object->purgeable_when_ripe = FALSE;
6928 } else {
6929 object->purgeable_when_ripe = TRUE;
6930 }
6931
6932 if (object->purgeable_when_ripe) {
6933 kern_return_t result;
6934
6935 /* try to add token... this can fail */
6936 vm_page_lock_queues();
6937
6938 result = vm_purgeable_token_add(queue);
6939 if (result != KERN_SUCCESS) {
6940 vm_page_unlock_queues();
6941 return result;
6942 }
6943 vm_page_unlock_queues();
6944 }
6945
6946 assert(object->resident_page_count >=
6947 object->wired_page_count);
6948 delta = (object->resident_page_count -
6949 object->wired_page_count);
6950
6951 if (delta != 0) {
6952 OSAddAtomic(delta,
6953 &vm_page_purgeable_count);
6954 }
6955 if (object->wired_page_count != 0) {
6956 OSAddAtomic(object->wired_page_count,
6957 &vm_page_purgeable_wired_count);
6958 }
6959
6960 object->purgable = new_state;
6961
6962 /* object should be on "non-volatile" queue */
6963 assert(object->objq.next != NULL);
6964 assert(object->objq.prev != NULL);
6965 }
6966 else if (old_state == VM_PURGABLE_VOLATILE) {
6967 purgeable_q_t old_queue;
6968 boolean_t purgeable_when_ripe;
6969
6970 /*
6971 * if reassigning priorities / purgeable groups, we don't change the
6972 * token queue. So moving priorities will not make pages stay around longer.
6973 * Reasoning is that the algorithm gives most priority to the most important
6974 * object. If a new token is added, the most important object's priority is boosted.
6975 * This biases the system already for purgeable queues that move a lot.
6976 * It doesn't seem more biasing is necessary in this case, where no new object is added.
6977 */
6978 assert(object->objq.next != NULL && object->objq.prev != NULL); /* object should be on a queue */
6979
6980 old_queue = vm_purgeable_object_remove(object);
6981 assert(old_queue);
6982
6983 if ((*state & VM_PURGABLE_NO_AGING_MASK) ==
6984 VM_PURGABLE_NO_AGING) {
6985 purgeable_when_ripe = FALSE;
6986 } else {
6987 purgeable_when_ripe = TRUE;
6988 }
6989
6990 if (old_queue != queue ||
6991 (purgeable_when_ripe !=
6992 object->purgeable_when_ripe)) {
6993 kern_return_t result;
6994
6995 /* Changing queue. Have to move token. */
6996 vm_page_lock_queues();
6997 if (object->purgeable_when_ripe) {
6998 vm_purgeable_token_delete_last(old_queue);
6999 }
7000 object->purgeable_when_ripe = purgeable_when_ripe;
7001 if (object->purgeable_when_ripe) {
7002 result = vm_purgeable_token_add(queue);
7003 assert(result==KERN_SUCCESS); /* this should never fail since we just freed a token */
7004 }
7005 vm_page_unlock_queues();
7006
7007 }
7008 };
7009 vm_purgeable_object_add(object, queue, (*state&VM_VOLATILE_GROUP_MASK)>>VM_VOLATILE_GROUP_SHIFT );
7010 if (old_state == VM_PURGABLE_NONVOLATILE) {
7011 vm_purgeable_accounting(object, VM_PURGABLE_NONVOLATILE,
7012 FALSE);
7013 }
7014
7015 assert(queue->debug_count_objects>=0);
7016
7017 break;
7018
7019
7020 case VM_PURGABLE_EMPTY:
7021 if (object->volatile_fault) {
7022 vm_page_t p;
7023 int refmod;
7024
7025 queue_iterate(&object->memq, p, vm_page_t, listq) {
7026 if (p->busy ||
7027 VM_PAGE_WIRED(p) ||
7028 p->fictitious) {
7029 continue;
7030 }
7031 refmod = pmap_disconnect(p->phys_page);
7032 if ((refmod & VM_MEM_MODIFIED) &&
7033 !p->dirty) {
7034 SET_PAGE_DIRTY(p, FALSE);
7035 }
7036 }
7037 }
7038
7039 if (old_state == new_state) {
7040 /* nothing changes */
7041 break;
7042 }
7043
7044 assert(old_state == VM_PURGABLE_NONVOLATILE ||
7045 old_state == VM_PURGABLE_VOLATILE);
7046 if (old_state == VM_PURGABLE_VOLATILE) {
7047 purgeable_q_t old_queue;
7048
7049 /* object should be on a queue */
7050 assert(object->objq.next != NULL &&
7051 object->objq.prev != NULL);
7052
7053 old_queue = vm_purgeable_object_remove(object);
7054 assert(old_queue);
7055 if (object->purgeable_when_ripe) {
7056 vm_page_lock_queues();
7057 vm_purgeable_token_delete_first(old_queue);
7058 vm_page_unlock_queues();
7059 }
7060 }
7061
7062 if (old_state == VM_PURGABLE_NONVOLATILE) {
7063 /*
7064 * This object's pages were previously accounted as
7065 * "non-volatile" and now need to be accounted as
7066 * "volatile".
7067 */
7068 vm_purgeable_accounting(object, VM_PURGABLE_NONVOLATILE,
7069 FALSE);
7070 /*
7071 * Set to VM_PURGABLE_EMPTY because the pages are no
7072 * longer accounted in the "non-volatile" ledger
7073 * and are also not accounted for in
7074 * "vm_page_purgeable_count".
7075 */
7076 object->purgable = VM_PURGABLE_EMPTY;
7077 }
7078
7079 (void) vm_object_purge(object, 0);
7080 assert(object->purgable == VM_PURGABLE_EMPTY);
7081
7082 break;
7083 }
7084
7085 *state = old_state;
7086
7087 vm_object_lock_assert_exclusive(object);
7088
7089 return KERN_SUCCESS;
7090 }
7091
7092 kern_return_t
7093 vm_object_get_page_counts(
7094 vm_object_t object,
7095 vm_object_offset_t offset,
7096 vm_object_size_t size,
7097 unsigned int *resident_page_count,
7098 unsigned int *dirty_page_count)
7099 {
7100
7101 kern_return_t kr = KERN_SUCCESS;
7102 boolean_t count_dirty_pages = FALSE;
7103 vm_page_t p = VM_PAGE_NULL;
7104 unsigned int local_resident_count = 0;
7105 unsigned int local_dirty_count = 0;
7106 vm_object_offset_t cur_offset = 0;
7107 vm_object_offset_t end_offset = 0;
7108
7109 if (object == VM_OBJECT_NULL)
7110 return KERN_INVALID_ARGUMENT;
7111
7112
7113 cur_offset = offset;
7114
7115 end_offset = offset + size;
7116
7117 vm_object_lock_assert_exclusive(object);
7118
7119 if (dirty_page_count != NULL) {
7120
7121 count_dirty_pages = TRUE;
7122 }
7123
7124 if (resident_page_count != NULL && count_dirty_pages == FALSE) {
7125 /*
7126 * Fast path when:
7127 * - we only want the resident page count, and,
7128 * - the entire object is exactly covered by the request.
7129 */
7130 if (offset == 0 && (object->vo_size == size)) {
7131
7132 *resident_page_count = object->resident_page_count;
7133 goto out;
7134 }
7135 }
7136
7137 if (object->resident_page_count <= (size >> PAGE_SHIFT)) {
7138
7139 queue_iterate(&object->memq, p, vm_page_t, listq) {
7140
7141 if (p->offset >= cur_offset && p->offset < end_offset) {
7142
7143 local_resident_count++;
7144
7145 if (count_dirty_pages) {
7146
7147 if (p->dirty || (p->wpmapped && pmap_is_modified(p->phys_page))) {
7148
7149 local_dirty_count++;
7150 }
7151 }
7152 }
7153 }
7154 } else {
7155
7156 for (cur_offset = offset; cur_offset < end_offset; cur_offset += PAGE_SIZE_64) {
7157
7158 p = vm_page_lookup(object, cur_offset);
7159
7160 if (p != VM_PAGE_NULL) {
7161
7162 local_resident_count++;
7163
7164 if (count_dirty_pages) {
7165
7166 if (p->dirty || (p->wpmapped && pmap_is_modified(p->phys_page))) {
7167
7168 local_dirty_count++;
7169 }
7170 }
7171 }
7172 }
7173
7174 }
7175
7176 if (resident_page_count != NULL) {
7177 *resident_page_count = local_resident_count;
7178 }
7179
7180 if (dirty_page_count != NULL) {
7181 *dirty_page_count = local_dirty_count;
7182 }
7183
7184 out:
7185 return kr;
7186 }
7187
7188
7189 #if TASK_SWAPPER
7190 /*
7191 * vm_object_res_deallocate
7192 *
7193 * (recursively) decrement residence counts on vm objects and their shadows.
7194 * Called from vm_object_deallocate and when swapping out an object.
7195 *
7196 * The object is locked, and remains locked throughout the function,
7197 * even as we iterate down the shadow chain. Locks on intermediate objects
7198 * will be dropped, but not the original object.
7199 *
7200 * NOTE: this function used to use recursion, rather than iteration.
7201 */
7202
7203 __private_extern__ void
7204 vm_object_res_deallocate(
7205 vm_object_t object)
7206 {
7207 vm_object_t orig_object = object;
7208 /*
7209 * Object is locked so it can be called directly
7210 * from vm_object_deallocate. Original object is never
7211 * unlocked.
7212 */
7213 assert(object->res_count > 0);
7214 while (--object->res_count == 0) {
7215 assert(object->ref_count >= object->res_count);
7216 vm_object_deactivate_all_pages(object);
7217 /* iterate on shadow, if present */
7218 if (object->shadow != VM_OBJECT_NULL) {
7219 vm_object_t tmp_object = object->shadow;
7220 vm_object_lock(tmp_object);
7221 if (object != orig_object)
7222 vm_object_unlock(object);
7223 object = tmp_object;
7224 assert(object->res_count > 0);
7225 } else
7226 break;
7227 }
7228 if (object != orig_object)
7229 vm_object_unlock(object);
7230 }
7231
7232 /*
7233 * vm_object_res_reference
7234 *
7235 * Internal function to increment residence count on a vm object
7236 * and its shadows. It is called only from vm_object_reference, and
7237 * when swapping in a vm object, via vm_map_swap.
7238 *
7239 * The object is locked, and remains locked throughout the function,
7240 * even as we iterate down the shadow chain. Locks on intermediate objects
7241 * will be dropped, but not the original object.
7242 *
7243 * NOTE: this function used to use recursion, rather than iteration.
7244 */
7245
7246 __private_extern__ void
7247 vm_object_res_reference(
7248 vm_object_t object)
7249 {
7250 vm_object_t orig_object = object;
7251 /*
7252 * Object is locked, so this can be called directly
7253 * from vm_object_reference. This lock is never released.
7254 */
7255 while ((++object->res_count == 1) &&
7256 (object->shadow != VM_OBJECT_NULL)) {
7257 vm_object_t tmp_object = object->shadow;
7258
7259 assert(object->ref_count >= object->res_count);
7260 vm_object_lock(tmp_object);
7261 if (object != orig_object)
7262 vm_object_unlock(object);
7263 object = tmp_object;
7264 }
7265 if (object != orig_object)
7266 vm_object_unlock(object);
7267 assert(orig_object->ref_count >= orig_object->res_count);
7268 }
7269 #endif /* TASK_SWAPPER */
7270
7271 /*
7272 * vm_object_reference:
7273 *
7274 * Gets another reference to the given object.
7275 */
7276 #ifdef vm_object_reference
7277 #undef vm_object_reference
7278 #endif
7279 __private_extern__ void
7280 vm_object_reference(
7281 register vm_object_t object)
7282 {
7283 if (object == VM_OBJECT_NULL)
7284 return;
7285
7286 vm_object_lock(object);
7287 assert(object->ref_count > 0);
7288 vm_object_reference_locked(object);
7289 vm_object_unlock(object);
7290 }
7291
7292 #ifdef MACH_BSD
7293 /*
7294 * Scale the vm_object_cache
7295 * This is required to make sure that the vm_object_cache is big
7296 * enough to effectively cache the mapped file.
7297 * This is really important with UBC as all the regular file vnodes
7298 * have a memory object associated with them. Having this cache too
7299 * small results in rapid reclaim of vnodes and hurts performance a LOT!
7300 *
7301 * This is also needed as number of vnodes can be dynamically scaled.
7302 */
7303 kern_return_t
7304 adjust_vm_object_cache(
7305 __unused vm_size_t oval,
7306 __unused vm_size_t nval)
7307 {
7308 #if VM_OBJECT_CACHE
7309 vm_object_cached_max = nval;
7310 vm_object_cache_trim(FALSE);
7311 #endif
7312 return (KERN_SUCCESS);
7313 }
7314 #endif /* MACH_BSD */
7315
7316
7317 /*
7318 * vm_object_transpose
7319 *
7320 * This routine takes two VM objects of the same size and exchanges
7321 * their backing store.
7322 * The objects should be "quiesced" via a UPL operation with UPL_SET_IO_WIRE
7323 * and UPL_BLOCK_ACCESS if they are referenced anywhere.
7324 *
7325 * The VM objects must not be locked by caller.
7326 */
7327 unsigned int vm_object_transpose_count = 0;
7328 kern_return_t
7329 vm_object_transpose(
7330 vm_object_t object1,
7331 vm_object_t object2,
7332 vm_object_size_t transpose_size)
7333 {
7334 vm_object_t tmp_object;
7335 kern_return_t retval;
7336 boolean_t object1_locked, object2_locked;
7337 vm_page_t page;
7338 vm_object_offset_t page_offset;
7339 lck_mtx_t *hash_lck;
7340 vm_object_hash_entry_t hash_entry;
7341
7342 tmp_object = VM_OBJECT_NULL;
7343 object1_locked = FALSE; object2_locked = FALSE;
7344
7345 if (object1 == object2 ||
7346 object1 == VM_OBJECT_NULL ||
7347 object2 == VM_OBJECT_NULL) {
7348 /*
7349 * If the 2 VM objects are the same, there's
7350 * no point in exchanging their backing store.
7351 */
7352 retval = KERN_INVALID_VALUE;
7353 goto done;
7354 }
7355
7356 /*
7357 * Since we need to lock both objects at the same time,
7358 * make sure we always lock them in the same order to
7359 * avoid deadlocks.
7360 */
7361 if (object1 > object2) {
7362 tmp_object = object1;
7363 object1 = object2;
7364 object2 = tmp_object;
7365 }
7366
7367 /*
7368 * Allocate a temporary VM object to hold object1's contents
7369 * while we copy object2 to object1.
7370 */
7371 tmp_object = vm_object_allocate(transpose_size);
7372 vm_object_lock(tmp_object);
7373 tmp_object->can_persist = FALSE;
7374
7375
7376 /*
7377 * Grab control of the 1st VM object.
7378 */
7379 vm_object_lock(object1);
7380 object1_locked = TRUE;
7381 if (!object1->alive || object1->terminating ||
7382 object1->copy || object1->shadow || object1->shadowed ||
7383 object1->purgable != VM_PURGABLE_DENY) {
7384 /*
7385 * We don't deal with copy or shadow objects (yet).
7386 */
7387 retval = KERN_INVALID_VALUE;
7388 goto done;
7389 }
7390 /*
7391 * We're about to mess with the object's backing store and
7392 * taking a "paging_in_progress" reference wouldn't be enough
7393 * to prevent any paging activity on this object, so the caller should
7394 * have "quiesced" the objects beforehand, via a UPL operation with
7395 * UPL_SET_IO_WIRE (to make sure all the pages are there and wired)
7396 * and UPL_BLOCK_ACCESS (to mark the pages "busy").
7397 *
7398 * Wait for any paging operation to complete (but only paging, not
7399 * other kinds of activity not linked to the pager). After we're
7400 * satisfied that there's no more paging in progress, we keep the
7401 * object locked, to guarantee that no one tries to access its pager.
7402 */
7403 vm_object_paging_only_wait(object1, THREAD_UNINT);
7404
7405 /*
7406 * Same as above for the 2nd object...
7407 */
7408 vm_object_lock(object2);
7409 object2_locked = TRUE;
7410 if (! object2->alive || object2->terminating ||
7411 object2->copy || object2->shadow || object2->shadowed ||
7412 object2->purgable != VM_PURGABLE_DENY) {
7413 retval = KERN_INVALID_VALUE;
7414 goto done;
7415 }
7416 vm_object_paging_only_wait(object2, THREAD_UNINT);
7417
7418
7419 if (object1->vo_size != object2->vo_size ||
7420 object1->vo_size != transpose_size) {
7421 /*
7422 * If the 2 objects don't have the same size, we can't
7423 * exchange their backing stores or one would overflow.
7424 * If their size doesn't match the caller's
7425 * "transpose_size", we can't do it either because the
7426 * transpose operation will affect the entire span of
7427 * the objects.
7428 */
7429 retval = KERN_INVALID_VALUE;
7430 goto done;
7431 }
7432
7433
7434 /*
7435 * Transpose the lists of resident pages.
7436 * This also updates the resident_page_count and the memq_hint.
7437 */
7438 if (object1->phys_contiguous || queue_empty(&object1->memq)) {
7439 /*
7440 * No pages in object1, just transfer pages
7441 * from object2 to object1. No need to go through
7442 * an intermediate object.
7443 */
7444 while (!queue_empty(&object2->memq)) {
7445 page = (vm_page_t) queue_first(&object2->memq);
7446 vm_page_rename(page, object1, page->offset, FALSE);
7447 }
7448 assert(queue_empty(&object2->memq));
7449 } else if (object2->phys_contiguous || queue_empty(&object2->memq)) {
7450 /*
7451 * No pages in object2, just transfer pages
7452 * from object1 to object2. No need to go through
7453 * an intermediate object.
7454 */
7455 while (!queue_empty(&object1->memq)) {
7456 page = (vm_page_t) queue_first(&object1->memq);
7457 vm_page_rename(page, object2, page->offset, FALSE);
7458 }
7459 assert(queue_empty(&object1->memq));
7460 } else {
7461 /* transfer object1's pages to tmp_object */
7462 while (!queue_empty(&object1->memq)) {
7463 page = (vm_page_t) queue_first(&object1->memq);
7464 page_offset = page->offset;
7465 vm_page_remove(page, TRUE);
7466 page->offset = page_offset;
7467 queue_enter(&tmp_object->memq, page, vm_page_t, listq);
7468 }
7469 assert(queue_empty(&object1->memq));
7470 /* transfer object2's pages to object1 */
7471 while (!queue_empty(&object2->memq)) {
7472 page = (vm_page_t) queue_first(&object2->memq);
7473 vm_page_rename(page, object1, page->offset, FALSE);
7474 }
7475 assert(queue_empty(&object2->memq));
7476 /* transfer tmp_object's pages to object1 */
7477 while (!queue_empty(&tmp_object->memq)) {
7478 page = (vm_page_t) queue_first(&tmp_object->memq);
7479 queue_remove(&tmp_object->memq, page,
7480 vm_page_t, listq);
7481 vm_page_insert(page, object2, page->offset);
7482 }
7483 assert(queue_empty(&tmp_object->memq));
7484 }
7485
7486 #define __TRANSPOSE_FIELD(field) \
7487 MACRO_BEGIN \
7488 tmp_object->field = object1->field; \
7489 object1->field = object2->field; \
7490 object2->field = tmp_object->field; \
7491 MACRO_END
7492
7493 /* "Lock" refers to the object not its contents */
7494 /* "size" should be identical */
7495 assert(object1->vo_size == object2->vo_size);
7496 /* "memq_hint" was updated above when transposing pages */
7497 /* "ref_count" refers to the object not its contents */
7498 #if TASK_SWAPPER
7499 /* "res_count" refers to the object not its contents */
7500 #endif
7501 /* "resident_page_count" was updated above when transposing pages */
7502 /* "wired_page_count" was updated above when transposing pages */
7503 /* "reusable_page_count" was updated above when transposing pages */
7504 /* there should be no "copy" */
7505 assert(!object1->copy);
7506 assert(!object2->copy);
7507 /* there should be no "shadow" */
7508 assert(!object1->shadow);
7509 assert(!object2->shadow);
7510 __TRANSPOSE_FIELD(vo_shadow_offset); /* used by phys_contiguous objects */
7511 __TRANSPOSE_FIELD(pager);
7512 __TRANSPOSE_FIELD(paging_offset);
7513 __TRANSPOSE_FIELD(pager_control);
7514 /* update the memory_objects' pointers back to the VM objects */
7515 if (object1->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
7516 memory_object_control_collapse(object1->pager_control,
7517 object1);
7518 }
7519 if (object2->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
7520 memory_object_control_collapse(object2->pager_control,
7521 object2);
7522 }
7523 __TRANSPOSE_FIELD(copy_strategy);
7524 /* "paging_in_progress" refers to the object not its contents */
7525 assert(!object1->paging_in_progress);
7526 assert(!object2->paging_in_progress);
7527 assert(object1->activity_in_progress);
7528 assert(object2->activity_in_progress);
7529 /* "all_wanted" refers to the object not its contents */
7530 __TRANSPOSE_FIELD(pager_created);
7531 __TRANSPOSE_FIELD(pager_initialized);
7532 __TRANSPOSE_FIELD(pager_ready);
7533 __TRANSPOSE_FIELD(pager_trusted);
7534 __TRANSPOSE_FIELD(can_persist);
7535 __TRANSPOSE_FIELD(internal);
7536 __TRANSPOSE_FIELD(temporary);
7537 __TRANSPOSE_FIELD(private);
7538 __TRANSPOSE_FIELD(pageout);
7539 /* "alive" should be set */
7540 assert(object1->alive);
7541 assert(object2->alive);
7542 /* "purgeable" should be non-purgeable */
7543 assert(object1->purgable == VM_PURGABLE_DENY);
7544 assert(object2->purgable == VM_PURGABLE_DENY);
7545 /* "shadowed" refers to the the object not its contents */
7546 __TRANSPOSE_FIELD(purgeable_when_ripe);
7547 __TRANSPOSE_FIELD(advisory_pageout);
7548 __TRANSPOSE_FIELD(true_share);
7549 /* "terminating" should not be set */
7550 assert(!object1->terminating);
7551 assert(!object2->terminating);
7552 __TRANSPOSE_FIELD(named);
7553 /* "shadow_severed" refers to the object not its contents */
7554 __TRANSPOSE_FIELD(phys_contiguous);
7555 __TRANSPOSE_FIELD(nophyscache);
7556 /* "cached_list.next" points to transposed object */
7557 object1->cached_list.next = (queue_entry_t) object2;
7558 object2->cached_list.next = (queue_entry_t) object1;
7559 /* "cached_list.prev" should be NULL */
7560 assert(object1->cached_list.prev == NULL);
7561 assert(object2->cached_list.prev == NULL);
7562 /* "msr_q" is linked to the object not its contents */
7563 assert(queue_empty(&object1->msr_q));
7564 assert(queue_empty(&object2->msr_q));
7565 __TRANSPOSE_FIELD(last_alloc);
7566 __TRANSPOSE_FIELD(sequential);
7567 __TRANSPOSE_FIELD(pages_created);
7568 __TRANSPOSE_FIELD(pages_used);
7569 __TRANSPOSE_FIELD(scan_collisions);
7570 #if MACH_PAGEMAP
7571 __TRANSPOSE_FIELD(existence_map);
7572 #endif
7573 __TRANSPOSE_FIELD(cow_hint);
7574 #if MACH_ASSERT
7575 __TRANSPOSE_FIELD(paging_object);
7576 #endif
7577 __TRANSPOSE_FIELD(wimg_bits);
7578 __TRANSPOSE_FIELD(set_cache_attr);
7579 __TRANSPOSE_FIELD(code_signed);
7580 if (object1->hashed) {
7581 hash_lck = vm_object_hash_lock_spin(object2->pager);
7582 hash_entry = vm_object_hash_lookup(object2->pager, FALSE);
7583 assert(hash_entry != VM_OBJECT_HASH_ENTRY_NULL);
7584 hash_entry->object = object2;
7585 vm_object_hash_unlock(hash_lck);
7586 }
7587 if (object2->hashed) {
7588 hash_lck = vm_object_hash_lock_spin(object1->pager);
7589 hash_entry = vm_object_hash_lookup(object1->pager, FALSE);
7590 assert(hash_entry != VM_OBJECT_HASH_ENTRY_NULL);
7591 hash_entry->object = object1;
7592 vm_object_hash_unlock(hash_lck);
7593 }
7594 __TRANSPOSE_FIELD(hashed);
7595 object1->transposed = TRUE;
7596 object2->transposed = TRUE;
7597 __TRANSPOSE_FIELD(mapping_in_progress);
7598 __TRANSPOSE_FIELD(volatile_empty);
7599 __TRANSPOSE_FIELD(volatile_fault);
7600 __TRANSPOSE_FIELD(all_reusable);
7601 assert(object1->blocked_access);
7602 assert(object2->blocked_access);
7603 assert(object1->__object2_unused_bits == 0);
7604 assert(object2->__object2_unused_bits == 0);
7605 #if UPL_DEBUG
7606 /* "uplq" refers to the object not its contents (see upl_transpose()) */
7607 #endif
7608 assert(object1->objq.next == NULL);
7609 assert(object1->objq.prev == NULL);
7610 assert(object2->objq.next == NULL);
7611 assert(object2->objq.prev == NULL);
7612
7613 #undef __TRANSPOSE_FIELD
7614
7615 retval = KERN_SUCCESS;
7616
7617 done:
7618 /*
7619 * Cleanup.
7620 */
7621 if (tmp_object != VM_OBJECT_NULL) {
7622 vm_object_unlock(tmp_object);
7623 /*
7624 * Re-initialize the temporary object to avoid
7625 * deallocating a real pager.
7626 */
7627 _vm_object_allocate(transpose_size, tmp_object);
7628 vm_object_deallocate(tmp_object);
7629 tmp_object = VM_OBJECT_NULL;
7630 }
7631
7632 if (object1_locked) {
7633 vm_object_unlock(object1);
7634 object1_locked = FALSE;
7635 }
7636 if (object2_locked) {
7637 vm_object_unlock(object2);
7638 object2_locked = FALSE;
7639 }
7640
7641 vm_object_transpose_count++;
7642
7643 return retval;
7644 }
7645
7646
7647 /*
7648 * vm_object_cluster_size
7649 *
7650 * Determine how big a cluster we should issue an I/O for...
7651 *
7652 * Inputs: *start == offset of page needed
7653 * *length == maximum cluster pager can handle
7654 * Outputs: *start == beginning offset of cluster
7655 * *length == length of cluster to try
7656 *
7657 * The original *start will be encompassed by the cluster
7658 *
7659 */
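/*
 * Illustrative sketch (not part of the original source): how a fault path
 * might use vm_object_cluster_size() to size a speculative read.  The
 * wrapper name and its setup are assumptions for illustration only; the
 * vm_object_cluster_size() call matches the definition below.
 */
#if 0
static void
example_size_cluster(vm_object_t object, vm_object_fault_info_t fault_info,
	vm_object_offset_t fault_offset)
{
	vm_object_offset_t	cluster_start;
	vm_size_t		cluster_length;
	uint32_t		io_streaming = 0;

	cluster_start = trunc_page_64(fault_offset);	/* offset of page needed */
	cluster_length = MAX_UPL_TRANSFER_BYTES;	/* most the pager can handle */

	vm_object_cluster_size(object, &cluster_start, &cluster_length,
	    fault_info, &io_streaming);

	/*
	 * cluster_start/cluster_length now bracket fault_offset and will
	 * not extend past EOF; io_streaming is set for sequential runs.
	 */
}
#endif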
7660 extern int speculative_reads_disabled;
7661 extern int ignore_is_ssd;
7662
7663 unsigned int preheat_max_bytes = MAX_UPL_TRANSFER_BYTES;
7664 unsigned int preheat_min_bytes = (1024 * 32);
7665
7666
7667 __private_extern__ void
7668 vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start,
7669 vm_size_t *length, vm_object_fault_info_t fault_info, uint32_t *io_streaming)
7670 {
7671 vm_size_t pre_heat_size;
7672 vm_size_t tail_size;
7673 vm_size_t head_size;
7674 vm_size_t max_length;
7675 vm_size_t cluster_size;
7676 vm_object_offset_t object_size;
7677 vm_object_offset_t orig_start;
7678 vm_object_offset_t target_start;
7679 vm_object_offset_t offset;
7680 vm_behavior_t behavior;
7681 boolean_t look_behind = TRUE;
7682 boolean_t look_ahead = TRUE;
7683 boolean_t isSSD = FALSE;
7684 uint32_t throttle_limit;
7685 int sequential_run;
7686 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
7687 vm_size_t max_ph_size;
7688 vm_size_t min_ph_size;
7689
7690 assert( !(*length & PAGE_MASK));
7691 assert( !(*start & PAGE_MASK_64));
7692
7693 /*
7694 * remember maximum length of run requested
7695 */
7696 max_length = *length;
7697 /*
7698 * we'll always return a cluster size of at least
7699 * 1 page, since the original fault must always
7700 * be processed
7701 */
7702 *length = PAGE_SIZE;
7703 *io_streaming = 0;
7704
7705 if (speculative_reads_disabled || fault_info == NULL) {
7706 /*
7707 * no cluster... just fault the page in
7708 */
7709 return;
7710 }
7711 orig_start = *start;
7712 target_start = orig_start;
7713 cluster_size = round_page(fault_info->cluster_size);
7714 behavior = fault_info->behavior;
7715
7716 vm_object_lock(object);
7717
7718 if (object->pager == MEMORY_OBJECT_NULL)
7719 goto out; /* pager is gone for this object, nothing more to do */
7720
7721 if (!ignore_is_ssd)
7722 vnode_pager_get_isSSD(object->pager, &isSSD);
7723
7724 min_ph_size = round_page(preheat_min_bytes);
7725 max_ph_size = round_page(preheat_max_bytes);
7726
7727 if (isSSD) {
7728 min_ph_size /= 2;
7729 max_ph_size /= 8;
7730 }
7731 if (min_ph_size < PAGE_SIZE)
7732 min_ph_size = PAGE_SIZE;
7733
7734 if (max_ph_size < PAGE_SIZE)
7735 max_ph_size = PAGE_SIZE;
7736 else if (max_ph_size > MAX_UPL_TRANSFER_BYTES)
7737 max_ph_size = MAX_UPL_TRANSFER_BYTES;
7738
7739 if (max_length > max_ph_size)
7740 max_length = max_ph_size;
7741
7742 if (max_length <= PAGE_SIZE)
7743 goto out;
7744
7745 if (object->internal)
7746 object_size = object->vo_size;
7747 else
7748 vnode_pager_get_object_size(object->pager, &object_size);
7749
7750 object_size = round_page_64(object_size);
7751
7752 if (orig_start >= object_size) {
7753 /*
7754 * fault occurred beyond the EOF...
7755 * we need to punt w/o changing the
7756 * starting offset
7757 */
7758 goto out;
7759 }
7760 if (object->pages_used > object->pages_created) {
7761 /*
7762 * must have wrapped our 32 bit counters
7763 * so reset
7764 */
7765 object->pages_used = object->pages_created = 0;
7766 }
7767 if ((sequential_run = object->sequential)) {
7768 if (sequential_run < 0) {
7769 sequential_behavior = VM_BEHAVIOR_RSEQNTL;
7770 sequential_run = 0 - sequential_run;
7771 } else {
7772 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
7773 }
7774
7775 }
7776 switch (behavior) {
7777
7778 default:
7779 behavior = VM_BEHAVIOR_DEFAULT;
7780
7781 case VM_BEHAVIOR_DEFAULT:
7782 if (object->internal && fault_info->user_tag == VM_MEMORY_STACK)
7783 goto out;
7784
7785 if (sequential_run >= (3 * PAGE_SIZE)) {
7786 pre_heat_size = sequential_run + PAGE_SIZE;
7787
7788 if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL)
7789 look_behind = FALSE;
7790 else
7791 look_ahead = FALSE;
7792
7793 *io_streaming = 1;
7794 } else {
7795
7796 if (object->pages_created < (20 * (min_ph_size >> PAGE_SHIFT))) {
7797 /*
7798 * prime the pump
7799 */
7800 pre_heat_size = min_ph_size;
7801 } else {
7802 /*
7803 * Linear growth in PH size: The maximum size is max_length...
7804 * this calculation will result in a size that is neither a
7805 * power of 2 nor a multiple of PAGE_SIZE... so round
7806 * it up to the nearest PAGE_SIZE boundary
7807 */
7808 pre_heat_size = (max_length * object->pages_used) / object->pages_created;
7809
7810 if (pre_heat_size < min_ph_size)
7811 pre_heat_size = min_ph_size;
7812 else
7813 pre_heat_size = round_page(pre_heat_size);
7814 }
7815 }
7816 break;
7817
7818 case VM_BEHAVIOR_RANDOM:
7819 if ((pre_heat_size = cluster_size) <= PAGE_SIZE)
7820 goto out;
7821 break;
7822
7823 case VM_BEHAVIOR_SEQUENTIAL:
7824 if ((pre_heat_size = cluster_size) == 0)
7825 pre_heat_size = sequential_run + PAGE_SIZE;
7826 look_behind = FALSE;
7827 *io_streaming = 1;
7828
7829 break;
7830
7831 case VM_BEHAVIOR_RSEQNTL:
7832 if ((pre_heat_size = cluster_size) == 0)
7833 pre_heat_size = sequential_run + PAGE_SIZE;
7834 look_ahead = FALSE;
7835 *io_streaming = 1;
7836
7837 break;
7838
7839 }
7840 throttle_limit = (uint32_t) max_length;
7841 assert(throttle_limit == max_length);
7842
7843 if (vnode_pager_get_throttle_io_limit(object->pager, &throttle_limit) == KERN_SUCCESS) {
7844 if (max_length > throttle_limit)
7845 max_length = throttle_limit;
7846 }
7847 if (pre_heat_size > max_length)
7848 pre_heat_size = max_length;
7849
7850 if (behavior == VM_BEHAVIOR_DEFAULT && (pre_heat_size > min_ph_size)) {
7851
7852 unsigned int consider_free = vm_page_free_count + vm_page_cleaned_count;
7853
7854 if (consider_free < vm_page_throttle_limit) {
7855 pre_heat_size = trunc_page(pre_heat_size / 16);
7856 } else if (consider_free < vm_page_free_target) {
7857 pre_heat_size = trunc_page(pre_heat_size / 4);
7858 }
7859
7860 if (pre_heat_size < min_ph_size)
7861 pre_heat_size = min_ph_size;
7862 }
7863 if (look_ahead == TRUE) {
7864 if (look_behind == TRUE) {
7865 /*
7866 * if we get here it's due to a random access...
7867 * so we want to center the original fault address
7868 * within the cluster we will issue... make sure
7869 * to calculate 'head_size' as a multiple of PAGE_SIZE...
7870 * 'pre_heat_size' is a multiple of PAGE_SIZE but not
7871 * necessarily an even number of pages so we need to truncate
7872 * the result to a PAGE_SIZE boundary
7873 */
7874 head_size = trunc_page(pre_heat_size / 2);
7875
7876 if (target_start > head_size)
7877 target_start -= head_size;
7878 else
7879 target_start = 0;
7880
7881 /*
7882 * 'target_start' at this point represents the beginning offset
7883 * of the cluster we are considering... 'orig_start' will be in
7884 * the center of this cluster if we didn't have to clip the start
7885 * due to running into the start of the file
7886 */
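/*
 * Worked example (illustration only): with 4K pages and
 * pre_heat_size == 7 pages (28K), head_size is trunc_page(14K) == 12K,
 * so target_start backs up 3 pages from orig_start; tail_size below
 * becomes 28K - 12K - 4K == 12K, leaving the faulting page centered
 * with 3 pages on either side.
 */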
7887 }
7888 if ((target_start + pre_heat_size) > object_size)
7889 pre_heat_size = (vm_size_t)(round_page_64(object_size - target_start));
7890 /*
7891 * at this point calculate the number of pages beyond the original fault
7892 * address that we want to consider... this is guaranteed not to extend beyond
7893 * the current EOF...
7894 */
7895 assert((vm_size_t)(orig_start - target_start) == (orig_start - target_start));
7896 tail_size = pre_heat_size - (vm_size_t)(orig_start - target_start) - PAGE_SIZE;
7897 } else {
7898 if (pre_heat_size > target_start) {
7899 /*
7900 * since pre_heat_size is always smaller than 2^32,
7901 * if it is larger than target_start (a 64 bit value)
7902 * it is safe to clip target_start to 32 bits
7903 */
7904 pre_heat_size = (vm_size_t) target_start;
7905 }
7906 tail_size = 0;
7907 }
7908 assert( !(target_start & PAGE_MASK_64));
7909 assert( !(pre_heat_size & PAGE_MASK));
7910
7911 if (pre_heat_size <= PAGE_SIZE)
7912 goto out;
7913
7914 if (look_behind == TRUE) {
7915 /*
7916 * take a look at the pages before the original
7917 * faulting offset... recalculate this in case
7918 * we had to clip 'pre_heat_size' above to keep
7919 * from running past the EOF.
7920 */
7921 head_size = pre_heat_size - tail_size - PAGE_SIZE;
7922
7923 for (offset = orig_start - PAGE_SIZE_64; head_size; offset -= PAGE_SIZE_64, head_size -= PAGE_SIZE) {
7924 /*
7925 * don't poke below the lowest offset
7926 */
7927 if (offset < fault_info->lo_offset)
7928 break;
7929 /*
7930 * for external objects and internal objects w/o an existence map
7931 * vm_external_state_get will return VM_EXTERNAL_STATE_UNKNOWN
7932 */
7933 #if MACH_PAGEMAP
7934 if (vm_external_state_get(object->existence_map, offset) == VM_EXTERNAL_STATE_ABSENT) {
7935 /*
7936 * we know for a fact that the pager can't provide the page
7937 * so don't include it or any pages beyond it in this cluster
7938 */
7939 break;
7940 }
7941 #endif /* MACH_PAGEMAP */
7942 if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
7943 == VM_EXTERNAL_STATE_ABSENT) {
7944 break;
7945 }
7946 if (vm_page_lookup(object, offset) != VM_PAGE_NULL) {
7947 /*
7948 * don't bridge resident pages
7949 */
7950 break;
7951 }
7952 *start = offset;
7953 *length += PAGE_SIZE;
7954 }
7955 }
7956 if (look_ahead == TRUE) {
7957 for (offset = orig_start + PAGE_SIZE_64; tail_size; offset += PAGE_SIZE_64, tail_size -= PAGE_SIZE) {
7958 /*
7959 * don't poke above the highest offset
7960 */
7961 if (offset >= fault_info->hi_offset)
7962 break;
7963 assert(offset < object_size);
7964
7965 /*
7966 * for external objects and internal objects w/o an existence map
7967 * vm_external_state_get will return VM_EXTERNAL_STATE_UNKNOWN
7968 */
7969 #if MACH_PAGEMAP
7970 if (vm_external_state_get(object->existence_map, offset) == VM_EXTERNAL_STATE_ABSENT) {
7971 /*
7972 * we know for a fact that the pager can't provide the page
7973 * so don't include it or any pages beyond it in this cluster
7974 */
7975 break;
7976 }
7977 #endif /* MACH_PAGEMAP */
7978 if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset) == VM_EXTERNAL_STATE_ABSENT) {
7979 break;
7980 }
7981 if (vm_page_lookup(object, offset) != VM_PAGE_NULL) {
7982 /*
7983 * don't bridge resident pages
7984 */
7985 break;
7986 }
7987 *length += PAGE_SIZE;
7988 }
7989 }
7990 out:
7991 if (*length > max_length)
7992 *length = max_length;
7993
7994 vm_object_unlock(object);
7995
7996 DTRACE_VM1(clustersize, vm_size_t, *length);
7997 }
7998
7999
8000 /*
8001 * Allow manipulation of individual page state. This is actually part of
8002 * the UPL regimen but takes place on the VM object rather than on a UPL
8003 */
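/*
 * Illustrative sketch (not part of the original source): a hypothetical
 * caller that marks one page busy and dirty and retrieves its physical
 * page number.  The UPL_POP_* flags are the real values handled below;
 * the wrapper itself is an assumption for illustration.
 */
#if 0
static kern_return_t
example_dirty_one_page(vm_object_t object, vm_object_offset_t offset,
	ppnum_t *phys)
{
	int	flags = 0;

	/* UPL_POP_BUSY keeps the returned physical page number valid */
	return vm_object_page_op(object, offset,
	    UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DIRTY,
	    phys, &flags);
}
#endif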
8004
8005 kern_return_t
8006 vm_object_page_op(
8007 vm_object_t object,
8008 vm_object_offset_t offset,
8009 int ops,
8010 ppnum_t *phys_entry,
8011 int *flags)
8012 {
8013 vm_page_t dst_page;
8014
8015 vm_object_lock(object);
8016
8017 if(ops & UPL_POP_PHYSICAL) {
8018 if(object->phys_contiguous) {
8019 if (phys_entry) {
8020 *phys_entry = (ppnum_t)
8021 (object->vo_shadow_offset >> PAGE_SHIFT);
8022 }
8023 vm_object_unlock(object);
8024 return KERN_SUCCESS;
8025 } else {
8026 vm_object_unlock(object);
8027 return KERN_INVALID_OBJECT;
8028 }
8029 }
8030 if(object->phys_contiguous) {
8031 vm_object_unlock(object);
8032 return KERN_INVALID_OBJECT;
8033 }
8034
8035 while(TRUE) {
8036 if((dst_page = vm_page_lookup(object,offset)) == VM_PAGE_NULL) {
8037 vm_object_unlock(object);
8038 return KERN_FAILURE;
8039 }
8040
8041 /* Sync up on getting the busy bit */
8042 if((dst_page->busy || dst_page->cleaning) &&
8043 (((ops & UPL_POP_SET) &&
8044 (ops & UPL_POP_BUSY)) || (ops & UPL_POP_DUMP))) {
8045 /* someone else is playing with the page, we will */
8046 /* have to wait */
8047 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
8048 continue;
8049 }
8050
8051 if (ops & UPL_POP_DUMP) {
8052 if (dst_page->pmapped == TRUE)
8053 pmap_disconnect(dst_page->phys_page);
8054
8055 VM_PAGE_FREE(dst_page);
8056 break;
8057 }
8058
8059 if (flags) {
8060 *flags = 0;
8061
8062 /* Get the condition of flags before requested ops */
8063 /* are undertaken */
8064
8065 if(dst_page->dirty) *flags |= UPL_POP_DIRTY;
8066 if(dst_page->pageout) *flags |= UPL_POP_PAGEOUT;
8067 if(dst_page->precious) *flags |= UPL_POP_PRECIOUS;
8068 if(dst_page->absent) *flags |= UPL_POP_ABSENT;
8069 if(dst_page->busy) *flags |= UPL_POP_BUSY;
8070 }
8071
8072 /* The caller should have set UPL_POP_BUSY, either in this call */
8073 /* or in a prior one */
8074 if(ops & UPL_POP_SET) {
8075 /* The protection granted with this assert will */
8076 /* not be complete. If the caller violates the */
8077 /* convention and attempts to change page state */
8078 /* without first setting busy we may not see it */
8079 /* because the page may already be busy. However */
8080 /* if such violations occur we will assert sooner */
8081 /* or later. */
8082 assert(dst_page->busy || (ops & UPL_POP_BUSY));
8083 if (ops & UPL_POP_DIRTY) {
8084 SET_PAGE_DIRTY(dst_page, FALSE);
8085 }
8086 if (ops & UPL_POP_PAGEOUT) dst_page->pageout = TRUE;
8087 if (ops & UPL_POP_PRECIOUS) dst_page->precious = TRUE;
8088 if (ops & UPL_POP_ABSENT) dst_page->absent = TRUE;
8089 if (ops & UPL_POP_BUSY) dst_page->busy = TRUE;
8090 }
8091
8092 if(ops & UPL_POP_CLR) {
8093 assert(dst_page->busy);
8094 if (ops & UPL_POP_DIRTY) dst_page->dirty = FALSE;
8095 if (ops & UPL_POP_PAGEOUT) dst_page->pageout = FALSE;
8096 if (ops & UPL_POP_PRECIOUS) dst_page->precious = FALSE;
8097 if (ops & UPL_POP_ABSENT) dst_page->absent = FALSE;
8098 if (ops & UPL_POP_BUSY) {
8099 dst_page->busy = FALSE;
8100 PAGE_WAKEUP(dst_page);
8101 }
8102 }
8103
8104 if (dst_page->encrypted) {
8105 /*
8106 * ENCRYPTED SWAP:
8107 * We need to decrypt this encrypted page before the
8108 * caller can access its contents.
8109 * But if the caller really wants to access the page's
8110 * contents, they have to keep the page "busy".
8111 * Otherwise, the page could get recycled or re-encrypted
8112 * at any time.
8113 */
8114 if ((ops & UPL_POP_SET) && (ops & UPL_POP_BUSY) &&
8115 dst_page->busy) {
8116 /*
8117 * The page is stable enough to be accessed by
8118 * the caller, so make sure its contents are
8119 * not encrypted.
8120 */
8121 vm_page_decrypt(dst_page, 0);
8122 } else {
8123 /*
8124 * The page is not busy, so don't bother
8125 * decrypting it, since anything could
8126 * happen to it between now and when the
8127 * caller wants to access it.
8128 * We should not give the caller access
8129 * to this page.
8130 */
8131 assert(!phys_entry);
8132 }
8133 }
8134
8135 if (phys_entry) {
8136 /*
8137 * The physical page number will remain valid
8138 * only if the page is kept busy.
8139 * ENCRYPTED SWAP: make sure we don't let the
8140 * caller access an encrypted page.
8141 */
8142 assert(dst_page->busy);
8143 assert(!dst_page->encrypted);
8144 *phys_entry = dst_page->phys_page;
8145 }
8146
8147 break;
8148 }
8149
8150 vm_object_unlock(object);
8151 return KERN_SUCCESS;
8152
8153 }
8154
8155 /*
8156 * vm_object_range_op offers a performance enhancement over
8157 * vm_object_page_op for operations which do not require page-level
8158 * state to be returned from the call. vm_object_page_op was created to
8159 * provide a low-cost alternative to page manipulation via UPLs when only
8160 * a single page is involved. The range_op call extends the _op family
8161 * of functions to multiple pages: because no page-level state is
8162 * handled, the caller avoids the overhead of the upl structures.
8163 */
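/*
 * Illustrative sketch (not part of the original source): discarding every
 * resident page in a range and learning how far the sweep got.  The
 * wrapper is an assumption for illustration; UPL_ROP_DUMP is the real
 * flag handled below.
 */
#if 0
static void
example_dump_range(vm_object_t object, vm_object_offset_t beg,
	vm_object_offset_t end)
{
	uint32_t	range = 0;

	(void) vm_object_range_op(object, beg, end, UPL_ROP_DUMP, &range);
	/* "range" now holds the number of bytes covered before stopping */
}
#endif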
8164
8165 kern_return_t
8166 vm_object_range_op(
8167 vm_object_t object,
8168 vm_object_offset_t offset_beg,
8169 vm_object_offset_t offset_end,
8170 int ops,
8171 uint32_t *range)
8172 {
8173 vm_object_offset_t offset;
8174 vm_page_t dst_page;
8175
8176 if (offset_end - offset_beg > (uint32_t) -1) {
8177 /* range is too big and would overflow "*range" */
8178 return KERN_INVALID_ARGUMENT;
8179 }
8180 if (object->resident_page_count == 0) {
8181 if (range) {
8182 if (ops & UPL_ROP_PRESENT) {
8183 *range = 0;
8184 } else {
8185 *range = (uint32_t) (offset_end - offset_beg);
8186 assert(*range == (offset_end - offset_beg));
8187 }
8188 }
8189 return KERN_SUCCESS;
8190 }
8191 vm_object_lock(object);
8192
8193 if (object->phys_contiguous) {
8194 vm_object_unlock(object);
8195 return KERN_INVALID_OBJECT;
8196 }
8197
8198 offset = offset_beg & ~PAGE_MASK_64;
8199
8200 while (offset < offset_end) {
8201 dst_page = vm_page_lookup(object, offset);
8202 if (dst_page != VM_PAGE_NULL) {
8203 if (ops & UPL_ROP_DUMP) {
8204 if (dst_page->busy || dst_page->cleaning) {
8205 /*
8206 * someone else is playing with the
8207 * page, we will have to wait
8208 */
8209 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
8210 /*
8211 * need to look the page up again since its
8212 * state may have changed while we slept...
8213 * it might even belong to a different object
8214 * at this point
8215 */
8216 continue;
8217 }
8218 if (dst_page->laundry) {
8219 dst_page->pageout = FALSE;
8220
8221 vm_pageout_steal_laundry(dst_page, FALSE);
8222 }
8223 if (dst_page->pmapped == TRUE)
8224 pmap_disconnect(dst_page->phys_page);
8225
8226 VM_PAGE_FREE(dst_page);
8227
8228 } else if ((ops & UPL_ROP_ABSENT) && !dst_page->absent)
8229 break;
8230 } else if (ops & UPL_ROP_PRESENT)
8231 break;
8232
8233 offset += PAGE_SIZE;
8234 }
8235 vm_object_unlock(object);
8236
8237 if (range) {
8238 if (offset > offset_end)
8239 offset = offset_end;
8240 if(offset > offset_beg) {
8241 *range = (uint32_t) (offset - offset_beg);
8242 assert(*range == (offset - offset_beg));
8243 } else {
8244 *range = 0;
8245 }
8246 }
8247 return KERN_SUCCESS;
8248 }
8249
8250 /*
8251 * Used to point a pager directly to a range of memory (when the pager may be associated
8252 * with a non-device vnode). Takes a virtual address, an offset, and a size. We currently
8253 * expect that the virtual address will denote the start of a range that is physically contiguous.
8254 */
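/*
 * Illustrative sketch (not part of the original source): pointing a pager
 * at a physically contiguous kernel buffer.  The wrapper and its argument
 * names are assumptions for illustration.
 */
#if 0
static kern_return_t
example_map_pager(memory_object_control_t control, addr64_t kva, vm_size_t size)
{
	/* kva must denote the start of a physically contiguous range */
	return pager_map_to_phys_contiguous(control, 0, kva, size);
}
#endif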
8255 kern_return_t pager_map_to_phys_contiguous(
8256 memory_object_control_t object,
8257 memory_object_offset_t offset,
8258 addr64_t base_vaddr,
8259 vm_size_t size)
8260 {
8261 ppnum_t page_num;
8262 boolean_t clobbered_private;
8263 kern_return_t retval;
8264 vm_object_t pager_object;
8265
8266 page_num = pmap_find_phys(kernel_pmap, base_vaddr);
8267
8268 if (!page_num) {
8269 retval = KERN_FAILURE;
8270 goto out;
8271 }
8272
8273 pager_object = memory_object_control_to_vm_object(object);
8274
8275 if (!pager_object) {
8276 retval = KERN_FAILURE;
8277 goto out;
8278 }
8279
8280 clobbered_private = pager_object->private;
8281 pager_object->private = TRUE;
8282 retval = vm_object_populate_with_private(pager_object, offset, page_num, size);
8283
8284 if (retval != KERN_SUCCESS)
8285 pager_object->private = clobbered_private;
8286
8287 out:
8288 return retval;
8289 }
8290
8291 uint32_t scan_object_collision = 0;
8292
8293 void
8294 vm_object_lock(vm_object_t object)
8295 {
8296 if (object == vm_pageout_scan_wants_object) {
8297 scan_object_collision++;
8298 mutex_pause(2);
8299 }
8300 lck_rw_lock_exclusive(&object->Lock);
8301 }
8302
8303 boolean_t
8304 vm_object_lock_avoid(vm_object_t object)
8305 {
8306 if (object == vm_pageout_scan_wants_object) {
8307 scan_object_collision++;
8308 return TRUE;
8309 }
8310 return FALSE;
8311 }
8312
8313 boolean_t
8314 _vm_object_lock_try(vm_object_t object)
8315 {
8316 return (lck_rw_try_lock_exclusive(&object->Lock));
8317 }
8318
8319 boolean_t
8320 vm_object_lock_try(vm_object_t object)
8321 {
8322 /*
8323 * Called from hibernate path so check before blocking.
8324 */
8325 if (vm_object_lock_avoid(object) && ml_get_interrupts_enabled() && get_preemption_level()==0) {
8326 mutex_pause(2);
8327 }
8328 return _vm_object_lock_try(object);
8329 }
8330
8331 void
8332 vm_object_lock_shared(vm_object_t object)
8333 {
8334 if (vm_object_lock_avoid(object)) {
8335 mutex_pause(2);
8336 }
8337 lck_rw_lock_shared(&object->Lock);
8338 }
8339
8340 boolean_t
8341 vm_object_lock_try_shared(vm_object_t object)
8342 {
8343 if (vm_object_lock_avoid(object)) {
8344 mutex_pause(2);
8345 }
8346 return (lck_rw_try_lock_shared(&object->Lock));
8347 }
8348
8349
8350 unsigned int vm_object_change_wimg_mode_count = 0;
8351
8352 /*
8353 * The object must be locked
8354 */
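/*
 * Illustrative sketch (not part of the original source): the locking
 * pattern required by the comment above.  The wrapper and the choice of
 * VM_WIMG_IO are assumptions for illustration.
 */
#if 0
static void
example_set_uncached(vm_object_t object)
{
	vm_object_lock(object);		/* exclusive lock required */
	vm_object_change_wimg_mode(object, VM_WIMG_IO);
	vm_object_unlock(object);
}
#endif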
8355 void
8356 vm_object_change_wimg_mode(vm_object_t object, unsigned int wimg_mode)
8357 {
8358 vm_page_t p;
8359
8360 vm_object_lock_assert_exclusive(object);
8361
8362 vm_object_paging_wait(object, THREAD_UNINT);
8363
8364 queue_iterate(&object->memq, p, vm_page_t, listq) {
8365
8366 if (!p->fictitious)
8367 pmap_set_cache_attributes(p->phys_page, wimg_mode);
8368 }
8369 if (wimg_mode == VM_WIMG_USE_DEFAULT)
8370 object->set_cache_attr = FALSE;
8371 else
8372 object->set_cache_attr = TRUE;
8373
8374 object->wimg_bits = wimg_mode;
8375
8376 vm_object_change_wimg_mode_count++;
8377 }
8378
8379 #if CONFIG_FREEZE
8380
8381 kern_return_t vm_object_pack(
8382 unsigned int *purgeable_count,
8383 unsigned int *wired_count,
8384 unsigned int *clean_count,
8385 unsigned int *dirty_count,
8386 unsigned int dirty_budget,
8387 boolean_t *shared,
8388 vm_object_t src_object,
8389 struct default_freezer_handle *df_handle)
8390 {
8391 kern_return_t kr = KERN_SUCCESS;
8392
8393 vm_object_lock(src_object);
8394
8395 *purgeable_count = *wired_count = *clean_count = *dirty_count = 0;
8396 *shared = FALSE;
8397
8398 if (!src_object->alive || src_object->terminating){
8399 kr = KERN_FAILURE;
8400 goto done;
8401 }
8402
8403 if (src_object->purgable == VM_PURGABLE_VOLATILE) {
8404 *purgeable_count = src_object->resident_page_count;
8405
8406 /* If the default freezer handle is null, we're just walking the pages to discover how many can be hibernated */
8407 if (df_handle != NULL) {
8408 purgeable_q_t queue;
8409 /* object should be on a queue */
8410 assert(src_object->objq.next != NULL &&
8411 src_object->objq.prev != NULL);
8412
8413 queue = vm_purgeable_object_remove(src_object);
8414 assert(queue);
8415 if (src_object->purgeable_when_ripe) {
8416 vm_page_lock_queues();
8417 vm_purgeable_token_delete_first(queue);
8418 vm_page_unlock_queues();
8419 }
8420
8421 vm_object_purge(src_object, 0);
8422 assert(src_object->purgable == VM_PURGABLE_EMPTY);
8423
8424 /*
8425 * This object was "volatile" so its pages must have
8426 * already been accounted as "volatile": no change
8427 * in accounting now that it's "empty".
8428 */
8429 }
8430 goto done;
8431 }
8432
8433 if (src_object->ref_count == 1) {
8434 vm_object_pack_pages(wired_count, clean_count, dirty_count, dirty_budget, src_object, df_handle);
8435 } else {
8436 if (src_object->internal) {
8437 *shared = TRUE;
8438 }
8439 }
8440 done:
8441 vm_object_unlock(src_object);
8442
8443 return kr;
8444 }
8445
8446
8447 void
8448 vm_object_pack_pages(
8449 unsigned int *wired_count,
8450 unsigned int *clean_count,
8451 unsigned int *dirty_count,
8452 unsigned int dirty_budget,
8453 vm_object_t src_object,
8454 struct default_freezer_handle *df_handle)
8455 {
8456 vm_page_t p, next;
8457
8458 next = (vm_page_t)queue_first(&src_object->memq);
8459
8460 while (!queue_end(&src_object->memq, (queue_entry_t)next)) {
8461 p = next;
8462 next = (vm_page_t)queue_next(&next->listq);
8463
8464 /* Finish up if we've hit our pageout limit */
8465 if (dirty_budget && (dirty_budget == *dirty_count)) {
8466 break;
8467 }
8468 assert(!p->laundry);
8469
8470 if (p->fictitious || p->busy )
8471 continue;
8472
8473 if (p->absent || p->unusual || p->error)
8474 continue;
8475
8476 if (VM_PAGE_WIRED(p)) {
8477 (*wired_count)++;
8478 continue;
8479 }
8480
8481 if (df_handle == NULL) {
8482 if (p->dirty || pmap_is_modified(p->phys_page)) {
8483 (*dirty_count)++;
8484 } else {
8485 (*clean_count)++;
8486 }
8487 continue;
8488 }
8489
8490 if (p->cleaning) {
8491 p->pageout = TRUE;
8492 continue;
8493 }
8494
8495 if (p->pmapped == TRUE) {
8496 int refmod_state;
8497 refmod_state = pmap_disconnect(p->phys_page);
8498 if (refmod_state & VM_MEM_MODIFIED) {
8499 SET_PAGE_DIRTY(p, FALSE);
8500 }
8501 }
8502
8503 if (p->dirty) {
8504 default_freezer_pack_page(p, df_handle);
8505 (*dirty_count)++;
8506 }
8507 else {
8508 VM_PAGE_FREE(p);
8509 (*clean_count)++;
8510 }
8511 }
8512 }
8513
8514 void
8515 vm_object_pageout(
8516 vm_object_t object)
8517 {
8518 vm_page_t p, next;
8519 struct vm_pageout_queue *iq;
8520 boolean_t set_pageout_bit = FALSE;
8521
8522 iq = &vm_pageout_queue_internal;
8523
8524 assert(object != VM_OBJECT_NULL );
8525
8526 vm_object_lock(object);
8527
8528 if (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE) {
8529 if (!object->pager_initialized) {
8530 /*
8531 * If there is no memory object for the page, create
8532 * one and hand it to the default pager.
8533 */
8534 vm_object_pager_create(object);
8535 }
8536
8537 set_pageout_bit = TRUE;
8538 }
8539
8540 if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) {
8541
8542 set_pageout_bit = FALSE;
8543 }
8544
8545 ReScan:
8546 next = (vm_page_t)queue_first(&object->memq);
8547
8548 while (!queue_end(&object->memq, (queue_entry_t)next)) {
8549 p = next;
8550 next = (vm_page_t)queue_next(&next->listq);
8551
8552 /* Throw to the pageout queue */
8553 vm_page_lockspin_queues();
8554
8555 /*
8556 * see if page is already in the process of
8557 * being cleaned... if so, leave it alone
8558 */
8559 if (!p->laundry) {
8560
8561 if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) {
8562
8563 if (VM_PAGE_Q_THROTTLED(iq)) {
8564
8565 iq->pgo_draining = TRUE;
8566
8567 assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
8568 vm_page_unlock_queues();
8569 vm_object_unlock(object);
8570
8571 thread_block(THREAD_CONTINUE_NULL);
8572
8573 vm_object_lock(object);
8574 goto ReScan;
8575 }
8576
8577 if (p->fictitious || p->busy ) {
8578 vm_page_unlock_queues();
8579 continue;
8580 }
8581
8582 if (p->absent || p->unusual || p->error || VM_PAGE_WIRED(p)) {
8583 vm_page_unlock_queues();
8584 continue;
8585 }
8586
8587 if (p->cleaning) {
8588 p->pageout = TRUE;
8589 vm_page_unlock_queues();
8590 continue;
8591 }
8592
8593 if (p->pmapped == TRUE) {
8594 int refmod_state;
8595 refmod_state = pmap_disconnect_options(p->phys_page, PMAP_OPTIONS_COMPRESSOR, NULL);
8596 if (refmod_state & VM_MEM_MODIFIED) {
8597 SET_PAGE_DIRTY(p, FALSE);
8598 }
8599 }
8600
8601 if (p->dirty == FALSE) {
8602 vm_page_unlock_queues();
8603 VM_PAGE_FREE(p);
8604 continue;
8605 }
8606 }
8607
8608 VM_PAGE_QUEUES_REMOVE(p);
8609 vm_pageout_cluster(p, set_pageout_bit);
8610 }
8611 vm_page_unlock_queues();
8612 }
8613
8614 vm_object_unlock(object);
8615 }
8616
8617 kern_return_t
8618 vm_object_pagein(
8619 vm_object_t object)
8620 {
8621 memory_object_t pager;
8622 kern_return_t kr;
8623
8624 vm_object_lock(object);
8625
8626 pager = object->pager;
8627
8628 if (!object->pager_ready || pager == MEMORY_OBJECT_NULL) {
8629 vm_object_unlock(object);
8630 return KERN_FAILURE;
8631 }
8632
8633 vm_object_paging_wait(object, THREAD_UNINT);
8634 vm_object_paging_begin(object);
8635
8636 object->blocked_access = TRUE;
8637 vm_object_unlock(object);
8638
8639 kr = memory_object_data_reclaim(pager, TRUE);
8640
8641 vm_object_lock(object);
8642
8643 object->blocked_access = FALSE;
8644 vm_object_paging_end(object);
8645
8646 vm_object_unlock(object);
8647
8648 return kr;
8649 }
8650 #endif /* CONFIG_FREEZE */
8651
8652
8653 #if CONFIG_IOSCHED
8654 void
8655 vm_page_request_reprioritize(vm_object_t o, uint64_t blkno, uint32_t len, int prio)
8656 {
8657 io_reprioritize_req_t req;
8658 struct vnode *devvp = NULL;
8659
8660 if(vnode_pager_get_object_devvp(o->pager, (uintptr_t *)&devvp) != KERN_SUCCESS)
8661 return;
8662
8663 /* Create the request for I/O reprioritization */
8664 req = (io_reprioritize_req_t)zalloc(io_reprioritize_req_zone);
8665 assert(req != NULL);
8666 req->blkno = blkno;
8667 req->len = len;
8668 req->priority = prio;
8669 req->devvp = devvp;
8670
8671 /* Insert request into the reprioritization list */
8672 IO_REPRIORITIZE_LIST_LOCK();
8673 queue_enter(&io_reprioritize_list, req, io_reprioritize_req_t, io_reprioritize_list);
8674 IO_REPRIORITIZE_LIST_UNLOCK();
8675
8676 /* Wakeup reprioritize thread */
8677 IO_REPRIO_THREAD_WAKEUP();
8678
8679 return;
8680 }
8681
8682 void
8683 vm_decmp_upl_reprioritize(upl_t upl, int prio)
8684 {
8685 int offset;
8686 vm_object_t object;
8687 io_reprioritize_req_t req;
8688 struct vnode *devvp = NULL;
8689 uint64_t blkno;
8690 uint32_t len;
8691 upl_t io_upl;
8692 uint64_t *io_upl_reprio_info;
8693 int io_upl_size;
8694
8695 if ((upl->flags & UPL_TRACKED_BY_OBJECT) == 0 || (upl->flags & UPL_EXPEDITE_SUPPORTED) == 0)
8696 return;
8697
8698 /*
8699 * We don't want to perform any allocations with the upl lock held since that might
8700 * result in a deadlock. If the system is low on memory, the pageout thread would
8701 * try to page out memory and might wait on this lock. If we are waiting for the memory to
8702 * be freed up by the pageout thread, it would be a deadlock.
8703 */
8704
8705
8706 /* First step is just to get the size of the upl to find out how big the reprio info is */
8707 if(!upl_try_lock(upl))
8708 return;
8709
8710 if (upl->decmp_io_upl == NULL) {
8711 /* The real I/O upl was destroyed by the time we came in here. Nothing to do. */
8712 upl_unlock(upl);
8713 return;
8714 }
8715
8716 io_upl = upl->decmp_io_upl;
8717 assert((io_upl->flags & UPL_DECMP_REAL_IO) != 0);
8718 io_upl_size = io_upl->size;
8719 upl_unlock(upl);
8720
8721 /* Now perform the allocation */
8722 io_upl_reprio_info = (uint64_t *)kalloc(sizeof(uint64_t) * (io_upl_size / PAGE_SIZE));
8723 if (io_upl_reprio_info == NULL)
8724 return;
8725
8726 /* Now again take the lock, recheck the state and grab out the required info */
8727 if(!upl_try_lock(upl))
8728 goto out;
8729
8730 if (upl->decmp_io_upl == NULL || upl->decmp_io_upl != io_upl) {
8731 /* The real I/O upl was destroyed by the time we came in here. Nothing to do. */
8732 upl_unlock(upl);
8733 goto out;
8734 }
8735 memcpy(io_upl_reprio_info, io_upl->upl_reprio_info, sizeof(uint64_t) * (io_upl_size / PAGE_SIZE));
8736
8737 /* Get the VM object for this UPL */
8738 if (io_upl->flags & UPL_SHADOWED) {
8739 object = io_upl->map_object->shadow;
8740 } else {
8741 object = io_upl->map_object;
8742 }
8743
8744 /* Get the dev vnode ptr for this object */
8745 if(!object || !object->pager ||
8746 vnode_pager_get_object_devvp(object->pager, (uintptr_t *)&devvp) != KERN_SUCCESS) {
8747 upl_unlock(upl);
8748 goto out;
8749 }
8750
8751 upl_unlock(upl);
8752
8753 /* Now we have all the information needed to do the expedite */
8754
8755 offset = 0;
8756 while (offset < io_upl_size) {
8757 blkno = io_upl_reprio_info[(offset / PAGE_SIZE)] & UPL_REPRIO_INFO_MASK;
8758 len = (io_upl_reprio_info[(offset / PAGE_SIZE)] >> UPL_REPRIO_INFO_SHIFT) & UPL_REPRIO_INFO_MASK;
8759
8760 /*
8761 * This implementation may cause some spurious expedites due to the
8762 * fact that we don't clean up the blkno & len from the upl_reprio_info
8763 * even after the I/O is complete.
8764 */
8765
8766 if (blkno != 0 && len != 0) {
8767 /* Create the request for I/O reprioritization */
8768 req = (io_reprioritize_req_t)zalloc(io_reprioritize_req_zone);
8769 assert(req != NULL);
8770 req->blkno = blkno;
8771 req->len = len;
8772 req->priority = prio;
8773 req->devvp = devvp;
8774
8775 /* Insert request into the reprioritization list */
8776 IO_REPRIORITIZE_LIST_LOCK();
8777 queue_enter(&io_reprioritize_list, req, io_reprioritize_req_t, io_reprioritize_list);
8778 IO_REPRIORITIZE_LIST_UNLOCK();
8779
8780 offset += len;
8781 } else {
8782 offset += PAGE_SIZE;
8783 }
8784 }
8785
8786 /* Wakeup reprioritize thread */
8787 IO_REPRIO_THREAD_WAKEUP();
8788
8789 out:
8790 kfree(io_upl_reprio_info, sizeof(uint64_t) * (io_upl_size / PAGE_SIZE));
8791 return;
8792 }
8793
8794 void
8795 vm_page_handle_prio_inversion(vm_object_t o, vm_page_t m)
8796 {
8797 upl_t upl;
8798 upl_page_info_t *pl;
8799 unsigned int i, num_pages;
8800 int cur_tier;
8801
8802 cur_tier = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
8803
8804 /*
8805 Scan through all UPLs associated with the object to find the
8806 UPL containing the contended page.
8807 */
8808 queue_iterate(&o->uplq, upl, upl_t, uplq) {
8809 if (((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) || upl->upl_priority <= cur_tier)
8810 continue;
8811 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
8812 num_pages = (upl->size / PAGE_SIZE);
8813
8814 /*
8815 For each page in the UPL page list, see if it matches the contended
8816 page and was issued as a low prio I/O.
8817 */
8818 for(i=0; i < num_pages; i++) {
8819 if(UPL_PAGE_PRESENT(pl,i) && m->phys_page == pl[i].phys_addr) {
8820 if ((upl->flags & UPL_DECMP_REQ) && upl->decmp_io_upl) {
8821 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_EXPEDITE)) | DBG_FUNC_NONE, upl->upl_creator, m, upl, upl->upl_priority, 0);
8822 vm_decmp_upl_reprioritize(upl, cur_tier);
8823 break;
8824 }
8825 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_EXPEDITE)) | DBG_FUNC_NONE, upl->upl_creator, m, upl->upl_reprio_info[i], upl->upl_priority, 0);
8826 if (UPL_REPRIO_INFO_BLKNO(upl, i) != 0 && UPL_REPRIO_INFO_LEN(upl, i) != 0)
8827 vm_page_request_reprioritize(o, UPL_REPRIO_INFO_BLKNO(upl, i), UPL_REPRIO_INFO_LEN(upl, i), cur_tier);
8828 break;
8829 }
8830 }
8831 /* Check if we found any hits */
8832 if (i != num_pages)
8833 break;
8834 }
8835
8836 return;
8837 }
8838
8839 wait_result_t
8840 vm_page_sleep(vm_object_t o, vm_page_t m, int interruptible)
8841 {
8842 wait_result_t ret;
8843
8844 KERNEL_DEBUG((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_SLEEP)) | DBG_FUNC_START, o, m, 0, 0, 0);
8845
8846 if (o->io_tracking && ((m->busy == TRUE) || (m->cleaning == TRUE) || VM_PAGE_WIRED(m))) {
8847 /*
8848 Indicates page is busy due to an I/O. Issue a reprioritize request if necessary.
8849 */
8850 vm_page_handle_prio_inversion(o,m);
8851 }
8852 m->wanted = TRUE;
8853 ret = thread_sleep_vm_object(o, m, interruptible);
8854 KERNEL_DEBUG((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_SLEEP)) | DBG_FUNC_END, o, m, 0, 0, 0);
8855 return ret;
8856 }
8857
8858 static void
8859 io_reprioritize_thread(void *param __unused, wait_result_t wr __unused)
8860 {
8861 io_reprioritize_req_t req = NULL;
8862
8863 while(1) {
8864
8865 IO_REPRIORITIZE_LIST_LOCK();
8866 if (queue_empty(&io_reprioritize_list)) {
8867 IO_REPRIORITIZE_LIST_UNLOCK();
8868 break;
8869 }
8870
8871 queue_remove_first(&io_reprioritize_list, req, io_reprioritize_req_t, io_reprioritize_list);
8872 IO_REPRIORITIZE_LIST_UNLOCK();
8873
8874 vnode_pager_issue_reprioritize_io(req->devvp, req->blkno, req->len, req->priority);
8875 zfree(io_reprioritize_req_zone, req);
8876 }
8877
8878 IO_REPRIO_THREAD_CONTINUATION();
8879 }
8880 #endif