X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/6601e61aa18bf4f09af135ff61fc7f4771d23b06..cb3231590a3c94ab4375e2228bd5e86b0cf1ad7e:/osfmk/vm/vm_object.c

diff --git a/osfmk/vm/vm_object.c b/osfmk/vm/vm_object.c
index 3f63de7d7..5b6250afc 100644
--- a/osfmk/vm/vm_object.c
+++ b/osfmk/vm/vm_object.c
@@ -1,49 +1,55 @@
 /*
- * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
 *
- * @APPLE_LICENSE_HEADER_START@
- *
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License"). You may not use this file except in compliance with the
- * License. Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
- *
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
- * License for the specific language governing rights and limitations
- * under the License.
- *
- * @APPLE_LICENSE_HEADER_END@
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
 /*
 * @OSF_COPYRIGHT@
 */
-/* 
+/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
- * 
+ *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
- * 
+ *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
- * 
+ *
 * Carnegie Mellon requests users of this software to return to
- * 
+ *
 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
 * School of Computer Science
 * Carnegie Mellon University
 * Pittsburgh PA 15213-3890
- * 
+ *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
*/ @@ -56,6 +62,7 @@ * Virtual memory object module. */ +#include #include #include @@ -65,27 +72,70 @@ #include #include +#include + #include #include #include #include -#include #include -#include +#include #include #include #include #include #include +#include #include +#include #include #include #include #include #include #include +#include + +#include + +#if CONFIG_PHANTOM_CACHE +#include +#endif + +#if VM_OBJECT_ACCESS_TRACKING +uint64_t vm_object_access_tracking_reads = 0; +uint64_t vm_object_access_tracking_writes = 0; +#endif /* VM_OBJECT_ACCESS_TRACKING */ + +boolean_t vm_object_collapse_compressor_allowed = TRUE; + +struct vm_counters vm_counters; + +#if VM_OBJECT_TRACKING +boolean_t vm_object_tracking_inited = FALSE; +btlog_t *vm_object_tracking_btlog; + +void +vm_object_tracking_init(void) +{ + int vm_object_tracking; + + vm_object_tracking = 1; + PE_parse_boot_argn("vm_object_tracking", &vm_object_tracking, + sizeof(vm_object_tracking)); + + if (vm_object_tracking) { + vm_object_tracking_btlog = btlog_create( + VM_OBJECT_TRACKING_NUM_RECORDS, + VM_OBJECT_TRACKING_BTDEPTH, + TRUE /* caller_will_remove_entries_for_element? */); + assert(vm_object_tracking_btlog); + vm_object_tracking_inited = TRUE; + } +} +#endif /* VM_OBJECT_TRACKING */ /* * Virtual memory objects maintain the actual data @@ -136,7 +186,7 @@ * that depend on the default memory manager are called * "internal". The "pager_created" field is provided to * indicate whether these ports have ever been allocated. - * + * * The kernel may also create virtual memory objects to * hold changed pages after a copy-on-write operation. * In this case, the virtual memory object (and its @@ -161,43 +211,37 @@ */ /* Forward declarations for internal functions. */ -static kern_return_t vm_object_terminate( - vm_object_t object); +static kern_return_t vm_object_terminate( + vm_object_t object); -extern void vm_object_remove( - vm_object_t object); +static kern_return_t vm_object_copy_call( + vm_object_t src_object, + vm_object_offset_t src_offset, + vm_object_size_t size, + vm_object_t *_result_object); -static vm_object_t vm_object_cache_trim( - boolean_t called_from_vm_object_deallocate); +static void vm_object_do_collapse( + vm_object_t object, + vm_object_t backing_object); -static void vm_object_deactivate_all_pages( - vm_object_t object); +static void vm_object_do_bypass( + vm_object_t object, + vm_object_t backing_object); -static kern_return_t vm_object_copy_call( - vm_object_t src_object, - vm_object_offset_t src_offset, - vm_object_size_t size, - vm_object_t *_result_object); +static void vm_object_release_pager( + memory_object_t pager); -static void vm_object_do_collapse( - vm_object_t object, - vm_object_t backing_object); - -static void vm_object_do_bypass( - vm_object_t object, - vm_object_t backing_object); - -static void vm_object_release_pager( - memory_object_t pager); - -static zone_t vm_object_zone; /* vm backing store zone */ +zone_t vm_object_zone; /* vm backing store zone */ /* * All wired-down kernel memory belongs to a single virtual * memory object (kernel_object) to avoid wasting data structures. 
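The vm_object_tracking_init() hook added above follows a common xnu pattern: pick a compiled-in default, let PE_parse_boot_argn() override it from the boot command line, and only set up the feature's state if the resulting flag is set. A minimal sketch of that pattern follows; PE_parse_boot_argn() is the real pexpert call used above, while everything named "my_*" is invented for the example.

#include <pexpert/pexpert.h>            /* PE_parse_boot_argn() */

static boolean_t my_feature_inited = FALSE;     /* hypothetical feature state */

void
my_feature_init(void)
{
        int enable = 0;                 /* default: off unless the boot-arg flips it */

        /* "my_feature=1" on the kernel command line enables the feature */
        PE_parse_boot_argn("my_feature", &enable, sizeof(enable));

        if (enable) {
                /* allocate whatever state the feature needs, then publish it */
                my_feature_inited = TRUE;
        }
}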
*/ -static struct vm_object kernel_object_store; -__private_extern__ vm_object_t kernel_object = &kernel_object_store; +static struct vm_object kernel_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); +vm_object_t kernel_object; + +static struct vm_object compressor_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); +vm_object_t compressor_object = &compressor_object_store; /* * The submap object is used as a placeholder for vm_map_submap @@ -205,7 +249,7 @@ __private_extern__ vm_object_t kernel_object = &kernel_object_store; * is exported by the vm_map module. The storage is declared * here because it must be initialized here. */ -static struct vm_object vm_submap_object_store; +static struct vm_object vm_submap_object_store __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); /* * Virtual memory objects are initialized from @@ -215,146 +259,87 @@ static struct vm_object vm_submap_object_store; * object structure, be sure to add initialization * (see _vm_object_allocate()). */ -static struct vm_object vm_object_template; - -/* - * Virtual memory objects that are not referenced by - * any address maps, but that are allowed to persist - * (an attribute specified by the associated memory manager), - * are kept in a queue (vm_object_cached_list). - * - * When an object from this queue is referenced again, - * for example to make another address space mapping, - * it must be removed from the queue. That is, the - * queue contains *only* objects with zero references. - * - * The kernel may choose to terminate objects from this - * queue in order to reclaim storage. The current policy - * is to permit a fixed maximum number of unreferenced - * objects (vm_object_cached_max). - * - * A spin lock (accessed by routines - * vm_object_cache_{lock,lock_try,unlock}) governs the - * object cache. It must be held when objects are - * added to or removed from the cache (in vm_object_terminate). - * The routines that acquire a reference to a virtual - * memory object based on one of the memory object ports - * must also lock the cache. - * - * Ideally, the object cache should be more isolated - * from the reference mechanism, so that the lock need - * not be held to make simple references. 
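kernel_object_store, compressor_object_store and vm_submap_object_store now carry __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))). The point of aligning the static objects this way is pointer packing: when every object sits on a large power-of-two boundary, its low address bits are always zero, so a reference to it can be stored in a narrower "packed" field. The stand-alone illustration below is only meant to show that idea; the 64-byte alignment, the base address, and the pack_ptr()/unpack_ptr() helpers are assumptions for the example, not the kernel's actual packing scheme.

#include <assert.h>
#include <stdint.h>

#define EXAMPLE_ALIGN   64u             /* assumed power-of-two alignment */
#define EXAMPLE_SHIFT   6               /* log2(EXAMPLE_ALIGN): bits that are always zero */

/* Pack an aligned pointer into 32 bits, relative to an assumed base address. */
static uint32_t
pack_ptr(uintptr_t base, const void *ptr)
{
        uintptr_t addr = (uintptr_t)ptr;

        assert((addr & (EXAMPLE_ALIGN - 1)) == 0);      /* alignment makes this lossless */
        return (uint32_t)((addr - base) >> EXAMPLE_SHIFT);
}

/* Recover the original pointer from its packed form. */
static void *
unpack_ptr(uintptr_t base, uint32_t packed)
{
        return (void *)(base + ((uintptr_t)packed << EXAMPLE_SHIFT));
}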
- */ -static queue_head_t vm_object_cached_list; -static int vm_object_cached_count=0; -static int vm_object_cached_high; /* highest # cached objects */ -static int vm_object_cached_max = 512; /* may be patched*/ - -static decl_mutex_data(,vm_object_cached_lock_data) - -#define vm_object_cache_lock() \ - mutex_lock(&vm_object_cached_lock_data) -#define vm_object_cache_lock_try() \ - mutex_try(&vm_object_cached_lock_data) -#define vm_object_cache_unlock() \ - mutex_unlock(&vm_object_cached_lock_data) - -#define VM_OBJECT_HASH_COUNT 1024 -static queue_head_t vm_object_hashtable[VM_OBJECT_HASH_COUNT]; -static struct zone *vm_object_hash_zone; - -struct vm_object_hash_entry { - queue_chain_t hash_link; /* hash chain link */ - memory_object_t pager; /* pager we represent */ - vm_object_t object; /* corresponding object */ - boolean_t waiting; /* someone waiting for - * termination */ -}; - -typedef struct vm_object_hash_entry *vm_object_hash_entry_t; -#define VM_OBJECT_HASH_ENTRY_NULL ((vm_object_hash_entry_t) 0) - -#define VM_OBJECT_HASH_SHIFT 8 -#define vm_object_hash(pager) \ - ((((unsigned)pager) >> VM_OBJECT_HASH_SHIFT) % VM_OBJECT_HASH_COUNT) - -void vm_object_hash_entry_free( - vm_object_hash_entry_t entry); - -static void vm_object_reap(vm_object_t object); -static void vm_object_reap_async(vm_object_t object); -static void vm_object_reaper_thread(void); -static queue_head_t vm_object_reaper_queue; /* protected by vm_object_cache_lock() */ -unsigned int vm_object_reap_count = 0; -unsigned int vm_object_reap_count_async = 0; +static struct vm_object vm_object_template; -/* - * vm_object_hash_lookup looks up a pager in the hashtable - * and returns the corresponding entry, with optional removal. - */ +unsigned int vm_page_purged_wired = 0; +unsigned int vm_page_purged_busy = 0; +unsigned int vm_page_purged_others = 0; -static vm_object_hash_entry_t -vm_object_hash_lookup( - memory_object_t pager, - boolean_t remove_entry) -{ - register queue_t bucket; - register vm_object_hash_entry_t entry; +static queue_head_t vm_object_cached_list; +static uint32_t vm_object_cache_pages_freed = 0; +static uint32_t vm_object_cache_pages_moved = 0; +static uint32_t vm_object_cache_pages_skipped = 0; +static uint32_t vm_object_cache_adds = 0; +static uint32_t vm_object_cached_count = 0; +static lck_mtx_t vm_object_cached_lock_data; +static lck_mtx_ext_t vm_object_cached_lock_data_ext; - bucket = &vm_object_hashtable[vm_object_hash(pager)]; +static uint32_t vm_object_page_grab_failed = 0; +static uint32_t vm_object_page_grab_skipped = 0; +static uint32_t vm_object_page_grab_returned = 0; +static uint32_t vm_object_page_grab_pmapped = 0; +static uint32_t vm_object_page_grab_reactivations = 0; - entry = (vm_object_hash_entry_t)queue_first(bucket); - while (!queue_end(bucket, (queue_entry_t)entry)) { - if (entry->pager == pager && !remove_entry) - return(entry); - else if (entry->pager == pager) { - queue_remove(bucket, entry, - vm_object_hash_entry_t, hash_link); - return(entry); - } +#define vm_object_cache_lock_spin() \ + lck_mtx_lock_spin(&vm_object_cached_lock_data) +#define vm_object_cache_unlock() \ + lck_mtx_unlock(&vm_object_cached_lock_data) - entry = (vm_object_hash_entry_t)queue_next(&entry->hash_link); - } +static void vm_object_cache_remove_locked(vm_object_t); - return(VM_OBJECT_HASH_ENTRY_NULL); -} -/* - * vm_object_hash_enter enters the specified - * pager / cache object association in the hashtable. 
- */ +static void vm_object_reap(vm_object_t object); +static void vm_object_reap_async(vm_object_t object); +static void vm_object_reaper_thread(void); -static void -vm_object_hash_insert( - vm_object_hash_entry_t entry) -{ - register queue_t bucket; +static lck_mtx_t vm_object_reaper_lock_data; +static lck_mtx_ext_t vm_object_reaper_lock_data_ext; - bucket = &vm_object_hashtable[vm_object_hash(entry->pager)]; +static queue_head_t vm_object_reaper_queue; /* protected by vm_object_reaper_lock() */ +unsigned int vm_object_reap_count = 0; +unsigned int vm_object_reap_count_async = 0; - queue_enter(bucket, entry, vm_object_hash_entry_t, hash_link); +#define vm_object_reaper_lock() \ + lck_mtx_lock(&vm_object_reaper_lock_data) +#define vm_object_reaper_lock_spin() \ + lck_mtx_lock_spin(&vm_object_reaper_lock_data) +#define vm_object_reaper_unlock() \ + lck_mtx_unlock(&vm_object_reaper_lock_data) + +#if CONFIG_IOSCHED +/* I/O Re-prioritization request list */ +queue_head_t io_reprioritize_list; +lck_spin_t io_reprioritize_list_lock; + +#define IO_REPRIORITIZE_LIST_LOCK() \ + lck_spin_lock_grp(&io_reprioritize_list_lock, &vm_object_lck_grp) +#define IO_REPRIORITIZE_LIST_UNLOCK() \ + lck_spin_unlock(&io_reprioritize_list_lock) + +#define MAX_IO_REPRIORITIZE_REQS 8192 +zone_t io_reprioritize_req_zone; + +/* I/O Re-prioritization thread */ +int io_reprioritize_wakeup = 0; +static void io_reprioritize_thread(void *param __unused, wait_result_t wr __unused); + +#define IO_REPRIO_THREAD_WAKEUP() thread_wakeup((event_t)&io_reprioritize_wakeup) +#define IO_REPRIO_THREAD_CONTINUATION() \ +{ \ + assert_wait(&io_reprioritize_wakeup, THREAD_UNINT); \ + thread_block(io_reprioritize_thread); \ } -static vm_object_hash_entry_t -vm_object_hash_entry_alloc( - memory_object_t pager) -{ - vm_object_hash_entry_t entry; - - entry = (vm_object_hash_entry_t)zalloc(vm_object_hash_zone); - entry->pager = pager; - entry->object = VM_OBJECT_NULL; - entry->waiting = FALSE; +void vm_page_request_reprioritize(vm_object_t, uint64_t, uint32_t, int); +void vm_page_handle_prio_inversion(vm_object_t, vm_page_t); +void vm_decmp_upl_reprioritize(upl_t, int); +#endif - return(entry); -} +#if 0 +#undef KERNEL_DEBUG +#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT +#endif -void -vm_object_hash_entry_free( - vm_object_hash_entry_t entry) -{ - zfree(vm_object_hash_zone, entry); -} /* * vm_object_allocate: @@ -364,39 +349,61 @@ vm_object_hash_entry_free( __private_extern__ void _vm_object_allocate( - vm_object_size_t size, - vm_object_t object) + vm_object_size_t size, + vm_object_t object) { - XPR(XPR_VM_OBJECT, - "vm_object_allocate, object 0x%X size 0x%X\n", - (integer_t)object, size, 0,0,0); - *object = vm_object_template; - queue_init(&object->memq); - queue_init(&object->msr_q); -#ifdef UPL_DEBUG + vm_page_queue_init(&object->memq); +#if UPL_DEBUG || CONFIG_IOSCHED queue_init(&object->uplq); -#endif /* UPL_DEBUG */ +#endif vm_object_lock_init(object); - object->size = size; + object->vo_size = size; + +#if VM_OBJECT_TRACKING_OP_CREATED + if (vm_object_tracking_inited) { + void *bt[VM_OBJECT_TRACKING_BTDEPTH]; + int numsaved = 0; + + numsaved = OSBacktrace(bt, VM_OBJECT_TRACKING_BTDEPTH); + btlog_add_entry(vm_object_tracking_btlog, + object, + VM_OBJECT_TRACKING_OP_CREATED, + bt, + numsaved); + } +#endif /* VM_OBJECT_TRACKING_OP_CREATED */ } __private_extern__ vm_object_t vm_object_allocate( - vm_object_size_t size) + vm_object_size_t size) { - register vm_object_t object; + vm_object_t object; object = (vm_object_t) 
zalloc(vm_object_zone); - + // dbgLog(object, size, 0, 2); /* (TEST/DEBUG) */ - if (object != VM_OBJECT_NULL) + if (object != VM_OBJECT_NULL) { _vm_object_allocate(size, object); + } return object; } + +lck_grp_t vm_object_lck_grp; +lck_grp_t vm_object_cache_lck_grp; +lck_grp_attr_t vm_object_lck_grp_attr; +lck_attr_t vm_object_lck_attr; +lck_attr_t kernel_object_lck_attr; +lck_attr_t compressor_object_lck_attr; + +extern void vm_named_entry_init(void); + +int workaround_41447923 = 0; + /* * vm_object_bootstrap: * @@ -405,53 +412,79 @@ vm_object_allocate( __private_extern__ void vm_object_bootstrap(void) { - register int i; + vm_size_t vm_object_size; - vm_object_zone = zinit((vm_size_t) sizeof(struct vm_object), - round_page_32(512*1024), - round_page_32(12*1024), - "vm objects"); + assert(sizeof(mo_ipc_object_bits_t) == sizeof(ipc_object_bits_t)); - queue_init(&vm_object_reaper_queue); + vm_object_size = (sizeof(struct vm_object) + (VM_PACKED_POINTER_ALIGNMENT - 1)) & ~(VM_PACKED_POINTER_ALIGNMENT - 1); + + vm_object_zone = zinit(vm_object_size, + round_page(512 * 1024), + round_page(12 * 1024), + "vm objects"); + zone_change(vm_object_zone, Z_CALLERACCT, FALSE); /* don't charge caller */ + zone_change(vm_object_zone, Z_NOENCRYPT, TRUE); + zone_change(vm_object_zone, Z_ALIGNMENT_REQUIRED, TRUE); + + vm_object_init_lck_grp(); queue_init(&vm_object_cached_list); - mutex_init(&vm_object_cached_lock_data, 0); - vm_object_hash_zone = - zinit((vm_size_t) sizeof (struct vm_object_hash_entry), - round_page_32(512*1024), - round_page_32(12*1024), - "vm object hash entries"); + lck_mtx_init_ext(&vm_object_cached_lock_data, + &vm_object_cached_lock_data_ext, + &vm_object_cache_lck_grp, + &vm_object_lck_attr); + + queue_init(&vm_object_reaper_queue); + + lck_mtx_init_ext(&vm_object_reaper_lock_data, + &vm_object_reaper_lock_data_ext, + &vm_object_lck_grp, + &vm_object_lck_attr); - for (i = 0; i < VM_OBJECT_HASH_COUNT; i++) - queue_init(&vm_object_hashtable[i]); /* * Fill in a template object, for quick initialization */ /* memq; Lock; init after allocation */ - vm_object_template.size = 0; + + vm_object_template.memq.prev = 0; + vm_object_template.memq.next = 0; +#if 0 + /* + * We can't call vm_object_lock_init() here because that will + * allocate some memory and VM is not fully initialized yet. + * The lock will be initialized for each allocated object in + * _vm_object_allocate(), so we don't need to initialize it in + * the vm_object_template. 
+ */ + vm_object_lock_init(&vm_object_template); +#endif +#if DEVELOPMENT || DEBUG + vm_object_template.Lock_owner = 0; +#endif + vm_object_template.vo_size = 0; vm_object_template.memq_hint = VM_PAGE_NULL; vm_object_template.ref_count = 1; -#if TASK_SWAPPER +#if TASK_SWAPPER vm_object_template.res_count = 1; -#endif /* TASK_SWAPPER */ +#endif /* TASK_SWAPPER */ vm_object_template.resident_page_count = 0; + vm_object_template.wired_page_count = 0; + vm_object_template.reusable_page_count = 0; vm_object_template.copy = VM_OBJECT_NULL; vm_object_template.shadow = VM_OBJECT_NULL; - vm_object_template.shadow_offset = (vm_object_offset_t) 0; - vm_object_template.cow_hint = ~(vm_offset_t)0; - vm_object_template.true_share = FALSE; - + vm_object_template.vo_shadow_offset = (vm_object_offset_t) 0; vm_object_template.pager = MEMORY_OBJECT_NULL; vm_object_template.paging_offset = 0; vm_object_template.pager_control = MEMORY_OBJECT_CONTROL_NULL; - /* msr_q; init after allocation */ - vm_object_template.copy_strategy = MEMORY_OBJECT_COPY_SYMMETRIC; - vm_object_template.absent_count = 0; vm_object_template.paging_in_progress = 0; +#if __LP64__ + vm_object_template.__object1_unused_bits = 0; +#endif /* __LP64__ */ + vm_object_template.activity_in_progress = 0; /* Begin bitfields */ vm_object_template.all_wanted = 0; /* all bits FALSE */ @@ -461,32 +494,95 @@ vm_object_bootstrap(void) vm_object_template.pager_trusted = FALSE; vm_object_template.can_persist = FALSE; vm_object_template.internal = TRUE; - vm_object_template.temporary = TRUE; vm_object_template.private = FALSE; vm_object_template.pageout = FALSE; vm_object_template.alive = TRUE; - vm_object_template.purgable = VM_OBJECT_NONPURGABLE; - vm_object_template.silent_overwrite = FALSE; - vm_object_template.advisory_pageout = FALSE; + vm_object_template.purgable = VM_PURGABLE_DENY; + vm_object_template.purgeable_when_ripe = FALSE; + vm_object_template.purgeable_only_by_kernel = FALSE; vm_object_template.shadowed = FALSE; + vm_object_template.true_share = FALSE; vm_object_template.terminating = FALSE; + vm_object_template.named = FALSE; vm_object_template.shadow_severed = FALSE; vm_object_template.phys_contiguous = FALSE; vm_object_template.nophyscache = FALSE; /* End bitfields */ - /* cache bitfields */ - vm_object_template.wimg_bits = VM_WIMG_DEFAULT; + vm_object_template.cached_list.prev = NULL; + vm_object_template.cached_list.next = NULL; - /* cached_list; init after allocation */ vm_object_template.last_alloc = (vm_object_offset_t) 0; - vm_object_template.cluster_size = 0; -#if MACH_PAGEMAP - vm_object_template.existence_map = VM_EXTERNAL_NULL; -#endif /* MACH_PAGEMAP */ -#if MACH_ASSERT - vm_object_template.paging_object = VM_OBJECT_NULL; -#endif /* MACH_ASSERT */ + vm_object_template.sequential = (vm_object_offset_t) 0; + vm_object_template.pages_created = 0; + vm_object_template.pages_used = 0; + vm_object_template.scan_collisions = 0; +#if CONFIG_PHANTOM_CACHE + vm_object_template.phantom_object_id = 0; +#endif + vm_object_template.cow_hint = ~(vm_offset_t)0; + + /* cache bitfields */ + vm_object_template.wimg_bits = VM_WIMG_USE_DEFAULT; + vm_object_template.set_cache_attr = FALSE; + vm_object_template.object_is_shared_cache = FALSE; + vm_object_template.code_signed = FALSE; + vm_object_template.transposed = FALSE; + vm_object_template.mapping_in_progress = FALSE; + vm_object_template.phantom_isssd = FALSE; + vm_object_template.volatile_empty = FALSE; + vm_object_template.volatile_fault = FALSE; + vm_object_template.all_reusable = FALSE; + 
vm_object_template.blocked_access = FALSE; + vm_object_template.vo_ledger_tag = VM_LEDGER_TAG_NONE; + vm_object_template.vo_no_footprint = FALSE; +#if CONFIG_IOSCHED || UPL_DEBUG + vm_object_template.uplq.prev = NULL; + vm_object_template.uplq.next = NULL; +#endif /* UPL_DEBUG */ +#ifdef VM_PIP_DEBUG + bzero(&vm_object_template.pip_holders, + sizeof(vm_object_template.pip_holders)); +#endif /* VM_PIP_DEBUG */ + + vm_object_template.objq.next = NULL; + vm_object_template.objq.prev = NULL; + vm_object_template.task_objq.next = NULL; + vm_object_template.task_objq.prev = NULL; + + vm_object_template.purgeable_queue_type = PURGEABLE_Q_TYPE_MAX; + vm_object_template.purgeable_queue_group = 0; + + vm_object_template.vo_cache_ts = 0; + + vm_object_template.wire_tag = VM_KERN_MEMORY_NONE; +#if !VM_TAG_ACTIVE_UPDATE + vm_object_template.wired_objq.next = NULL; + vm_object_template.wired_objq.prev = NULL; +#endif /* ! VM_TAG_ACTIVE_UPDATE */ + + vm_object_template.io_tracking = FALSE; + +#if CONFIG_SECLUDED_MEMORY + vm_object_template.eligible_for_secluded = FALSE; + vm_object_template.can_grab_secluded = FALSE; +#else /* CONFIG_SECLUDED_MEMORY */ + vm_object_template.__object3_unused_bits = 0; +#endif /* CONFIG_SECLUDED_MEMORY */ + +#if VM_OBJECT_ACCESS_TRACKING + vm_object_template.access_tracking = FALSE; + vm_object_template.access_tracking_reads = 0; + vm_object_template.access_tracking_writes = 0; +#endif /* VM_OBJECT_ACCESS_TRACKING */ + +#if DEBUG + bzero(&vm_object_template.purgeable_owner_bt[0], + sizeof(vm_object_template.purgeable_owner_bt)); + vm_object_template.vo_purgeable_volatilizer = NULL; + bzero(&vm_object_template.purgeable_volatilizer_bt[0], + sizeof(vm_object_template.purgeable_volatilizer_bt)); +#endif /* DEBUG */ /* * Initialize the "kernel object" @@ -495,18 +591,18 @@ vm_object_bootstrap(void) kernel_object = &kernel_object_store; /* - * Note that in the following size specifications, we need to add 1 because + * Note that in the following size specifications, we need to add 1 because * VM_MAX_KERNEL_ADDRESS (vm_last_addr) is a maximum address, not a size. */ -#ifdef ppc - _vm_object_allocate((vm_last_addr - VM_MIN_KERNEL_ADDRESS) + 1, - kernel_object); -#else - _vm_object_allocate((VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) + 1, - kernel_object); -#endif + _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, + kernel_object); + + _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, + compressor_object); kernel_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; + compressor_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; + kernel_object->no_tag_update = TRUE; /* * Initialize the "submap object". 
Make it as large as the @@ -514,13 +610,8 @@ vm_object_bootstrap(void) */ vm_submap_object = &vm_submap_object_store; -#ifdef ppc - _vm_object_allocate((vm_last_addr - VM_MIN_KERNEL_ADDRESS) + 1, - vm_submap_object); -#else - _vm_object_allocate((VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) + 1, - vm_submap_object); -#endif + _vm_object_allocate(VM_MAX_KERNEL_ADDRESS + 1, + vm_submap_object); vm_submap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; /* @@ -530,25 +621,53 @@ vm_object_bootstrap(void) */ vm_object_reference(vm_submap_object); -#if MACH_PAGEMAP - vm_external_module_initialize(); -#endif /* MACH_PAGEMAP */ + vm_named_entry_init(); + + PE_parse_boot_argn("workaround_41447923", &workaround_41447923, + sizeof(workaround_41447923)); +} + +#if CONFIG_IOSCHED +void +vm_io_reprioritize_init(void) +{ + kern_return_t result; + thread_t thread = THREAD_NULL; + + /* Initialze the I/O reprioritization subsystem */ + lck_spin_init(&io_reprioritize_list_lock, &vm_object_lck_grp, &vm_object_lck_attr); + queue_init(&io_reprioritize_list); + + io_reprioritize_req_zone = zinit(sizeof(struct io_reprioritize_req), + MAX_IO_REPRIORITIZE_REQS * sizeof(struct io_reprioritize_req), + 4096, "io_reprioritize_req"); + zone_change(io_reprioritize_req_zone, Z_COLLECT, FALSE); + + result = kernel_thread_start_priority(io_reprioritize_thread, NULL, 95 /* MAXPRI_KERNEL */, &thread); + if (result == KERN_SUCCESS) { + thread_set_thread_name(thread, "VM_io_reprioritize_thread"); + thread_deallocate(thread); + } else { + panic("Could not create io_reprioritize_thread"); + } } +#endif void vm_object_reaper_init(void) { - kern_return_t kr; - thread_t thread; + kern_return_t kr; + thread_t thread; kr = kernel_thread_start_priority( (thread_continue_t) vm_object_reaper_thread, NULL, - BASEPRI_PREEMPT - 1, + BASEPRI_VM, &thread); if (kr != KERN_SUCCESS) { panic("failed to launch vm_object_reaper_thread kr=0x%x", kr); } + thread_set_thread_name(thread, "VM_object_reaper_thread"); thread_deallocate(thread); } @@ -560,18 +679,23 @@ vm_object_init(void) */ } -/* remove the typedef below when emergency work-around is taken out */ -typedef struct vnode_pager { - memory_object_t pager; - memory_object_t pager_handle; /* pager */ - memory_object_control_t control_handle; /* memory object's control handle */ - void *vnode_handle; /* vnode handle */ -} *vnode_pager_t; -#define MIGHT_NOT_CACHE_SHADOWS 1 -#if MIGHT_NOT_CACHE_SHADOWS -static int cache_shadows = TRUE; -#endif /* MIGHT_NOT_CACHE_SHADOWS */ +__private_extern__ void +vm_object_init_lck_grp(void) +{ + /* + * initialze the vm_object lock world + */ + lck_grp_attr_setdefault(&vm_object_lck_grp_attr); + lck_grp_init(&vm_object_lck_grp, "vm_object", &vm_object_lck_grp_attr); + lck_grp_init(&vm_object_cache_lck_grp, "vm_object_cache", &vm_object_lck_grp_attr); + lck_attr_setdefault(&vm_object_lck_attr); + lck_attr_setdefault(&kernel_object_lck_attr); + lck_attr_cleardebug(&kernel_object_lck_attr); + lck_attr_setdefault(&compressor_object_lck_attr); + lck_attr_cleardebug(&compressor_object_lck_attr); +} + /* * vm_object_deallocate: @@ -584,39 +708,121 @@ static int cache_shadows = TRUE; * * No object may be locked. 
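vm_io_reprioritize_init() and vm_object_reaper_init() above share one recipe for bringing up a kernel service thread: create it with kernel_thread_start_priority(), panic if that fails, then name the thread and drop the reference the start call handed back. A condensed sketch of the recipe, with a placeholder worker body and name:

static void
my_worker(void *param __unused, wait_result_t wr __unused)
{
        /* service loop would live here */
}

static void
my_worker_init(void)
{
        thread_t        thread = THREAD_NULL;
        kern_return_t   kr;

        kr = kernel_thread_start_priority((thread_continue_t) my_worker,
            NULL, BASEPRI_VM, &thread);
        if (kr != KERN_SUCCESS) {
                panic("failed to launch my_worker kr=0x%x", kr);
        }
        /* the start call returned a reference: name the thread, then release it */
        thread_set_thread_name(thread, "my_worker");
        thread_deallocate(thread);
}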
*/ +unsigned long vm_object_deallocate_shared_successes = 0; +unsigned long vm_object_deallocate_shared_failures = 0; +unsigned long vm_object_deallocate_shared_swap_failures = 0; + __private_extern__ void vm_object_deallocate( - register vm_object_t object) + vm_object_t object) { - boolean_t retry_cache_trim = FALSE; - vm_object_t shadow = VM_OBJECT_NULL; - + vm_object_t shadow = VM_OBJECT_NULL; + // if(object)dbgLog(object, object->ref_count, object->can_persist, 3); /* (TEST/DEBUG) */ // else dbgLog(object, 0, 0, 3); /* (TEST/DEBUG) */ + if (object == VM_OBJECT_NULL) { + return; + } - while (object != VM_OBJECT_NULL) { + if (object == kernel_object || object == compressor_object) { + vm_object_lock_shared(object); + OSAddAtomic(-1, &object->ref_count); + + if (object->ref_count == 0) { + if (object == kernel_object) { + panic("vm_object_deallocate: losing kernel_object\n"); + } else { + panic("vm_object_deallocate: losing compressor_object\n"); + } + } + vm_object_unlock(object); + return; + } + + if (object->ref_count == 2 && + object->named) { + /* + * This "named" object's reference count is about to + * drop from 2 to 1: + * we'll need to call memory_object_last_unmap(). + */ + } else if (object->ref_count == 2 && + object->internal && + object->shadow != VM_OBJECT_NULL) { + /* + * This internal object's reference count is about to + * drop from 2 to 1 and it has a shadow object: + * we'll want to try and collapse this object with its + * shadow. + */ + } else if (object->ref_count >= 2) { + UInt32 original_ref_count; + volatile UInt32 *ref_count_p; + Boolean atomic_swap; + + /* + * The object currently looks like it is not being + * kept alive solely by the reference we're about to release. + * Let's try and release our reference without taking + * all the locks we would need if we had to terminate the + * object (cache lock + exclusive object lock). + * Lock the object "shared" to make sure we don't race with + * anyone holding it "exclusive". + */ + vm_object_lock_shared(object); + ref_count_p = (volatile UInt32 *) &object->ref_count; + original_ref_count = object->ref_count; /* - * The cache holds a reference (uncounted) to - * the object; we must lock it before removing - * the object. + * Test again as "ref_count" could have changed. + * "named" shouldn't change. */ - for (;;) { - vm_object_cache_lock(); + if (original_ref_count == 2 && + object->named) { + /* need to take slow path for m_o_last_unmap() */ + atomic_swap = FALSE; + } else if (original_ref_count == 2 && + object->internal && + object->shadow != VM_OBJECT_NULL) { + /* need to take slow path for vm_object_collapse() */ + atomic_swap = FALSE; + } else if (original_ref_count < 2) { + /* need to take slow path for vm_object_terminate() */ + atomic_swap = FALSE; + } else { + /* try an atomic update with the shared lock */ + atomic_swap = OSCompareAndSwap( + original_ref_count, + original_ref_count - 1, + (UInt32 *) &object->ref_count); + if (atomic_swap == FALSE) { + vm_object_deallocate_shared_swap_failures++; + /* fall back to the slow path... */ + } + } + + vm_object_unlock(object); + if (atomic_swap) { /* - * if we try to take a regular lock here - * we risk deadlocking against someone - * holding a lock on this object while - * trying to vm_object_deallocate a different - * object + * ref_count was updated atomically ! 
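The fast path added above drops a reference without ever taking the object lock exclusively: it holds the lock shared (so no exclusive holder can be mutating the object), re-checks that this release cannot be the last reference and that none of the special cases apply, and then compare-and-swaps ref_count down by one, falling back to the slow path if the swap loses a race. A reduced sketch of that optimistic decrement, with the named-object and shadow special cases elided and a hypothetical helper name:

static boolean_t
try_release_reference_fast(vm_object_t object)
{
        UInt32  original_ref_count;
        Boolean swapped;

        vm_object_lock_shared(object);          /* excludes exclusive lockers only */
        original_ref_count = object->ref_count;

        if (original_ref_count < 2) {
                /* could be the last reference: the slow path must terminate/collapse */
                swapped = FALSE;
        } else {
                /* optimistic decrement; fails if ref_count changed underneath us */
                swapped = OSCompareAndSwap(original_ref_count,
                    original_ref_count - 1,
                    (UInt32 *) &object->ref_count);
        }
        vm_object_unlock(object);

        return swapped;         /* FALSE: caller falls back to the exclusive-lock path */
}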
*/ - if (vm_object_lock_try(object)) - break; - vm_object_cache_unlock(); - mutex_pause(); /* wait a bit */ + vm_object_deallocate_shared_successes++; + return; } + + /* + * Someone else updated the ref_count at the same + * time and we lost the race. Fall back to the usual + * slow but safe path... + */ + vm_object_deallocate_shared_failures++; + } + + while (object != VM_OBJECT_NULL) { + vm_object_lock(object); + assert(object->ref_count > 0); /* @@ -624,35 +830,23 @@ vm_object_deallocate( * that reference would remain, inform the pager * about the last "mapping" reference going away. */ - if ((object->ref_count == 2) && (object->named)) { - memory_object_t pager = object->pager; + if ((object->ref_count == 2) && (object->named)) { + memory_object_t pager = object->pager; /* Notify the Pager that there are no */ /* more mappers for this object */ if (pager != MEMORY_OBJECT_NULL) { + vm_object_mapping_wait(object, THREAD_UNINT); + vm_object_mapping_begin(object); vm_object_unlock(object); - vm_object_cache_unlock(); - - memory_object_unmap(pager); - for (;;) { - vm_object_cache_lock(); + memory_object_last_unmap(pager); - /* - * if we try to take a regular lock here - * we risk deadlocking against someone - * holding a lock on this object while - * trying to vm_object_deallocate a different - * object - */ - if (vm_object_lock_try(object)) - break; - vm_object_cache_unlock(); - mutex_pause(); /* wait a bit */ - } - assert(object->ref_count > 0); + vm_object_lock(object); + vm_object_mapping_end(object); } + assert(object->ref_count > 0); } /* @@ -668,30 +862,31 @@ vm_object_deallocate( /* terminate again. */ if ((object->ref_count > 1) || object->terminating) { + vm_object_lock_assert_exclusive(object); object->ref_count--; vm_object_res_deallocate(object); - vm_object_cache_unlock(); if (object->ref_count == 1 && object->shadow != VM_OBJECT_NULL) { /* - * We don't use this VM object anymore. We - * would like to collapse it into its parent(s), - * but we don't have any pointers back to these - * parent object(s). + * There's only one reference left on this + * VM object. We can't tell if it's a valid + * one (from a mapping for example) or if this + * object is just part of a possibly stale and + * useless shadow chain. + * We would like to try and collapse it into + * its parent, but we don't have any pointers + * back to this parent object. * But we can try and collapse this object with * its own shadows, in case these are useless * too... + * We can't bypass this object though, since we + * don't know if this last reference on it is + * meaningful or not. */ - vm_object_collapse(object, 0); - } - - vm_object_unlock(object); - if (retry_cache_trim && - ((object = vm_object_cache_trim(TRUE)) != - VM_OBJECT_NULL)) { - continue; + vm_object_collapse(object, 0, FALSE); } + vm_object_unlock(object); return; } @@ -699,230 +894,415 @@ vm_object_deallocate( * We have to wait for initialization * before destroying or caching the object. */ - - if (object->pager_created && ! object->pager_initialized) { - assert(! object->can_persist); + + if (object->pager_created && !object->pager_initialized) { + assert(!object->can_persist); vm_object_assert_wait(object, - VM_OBJECT_EVENT_INITIALIZED, - THREAD_UNINT); + VM_OBJECT_EVENT_INITIALIZED, + THREAD_UNINT); vm_object_unlock(object); - vm_object_cache_unlock(); + thread_block(THREAD_CONTINUE_NULL); continue; } + VM_OBJ_RES_DECR(object); /* XXX ? */ /* - * If this object can persist, then enter it in - * the cache. Otherwise, terminate it. 
- * - * NOTE: Only permanent objects are cached, and - * permanent objects cannot have shadows. This - * affects the residence counting logic in a minor - * way (can do it in-line, mostly). + * Terminate this object. If it had a shadow, + * then deallocate it; otherwise, if we need + * to retry a cache trim, do so now; otherwise, + * we are done. "pageout" objects have a shadow, + * but maintain a "paging reference" rather than + * a normal reference. */ + shadow = object->pageout?VM_OBJECT_NULL:object->shadow; - if ((object->can_persist) && (object->alive)) { - /* - * Now it is safe to decrement reference count, - * and to return if reference count is > 0. - */ - if (--object->ref_count > 0) { - vm_object_res_deallocate(object); - vm_object_unlock(object); - vm_object_cache_unlock(); - if (retry_cache_trim && - ((object = vm_object_cache_trim(TRUE)) != - VM_OBJECT_NULL)) { - continue; - } - return; - } + if (vm_object_terminate(object) != KERN_SUCCESS) { + return; + } + if (shadow != VM_OBJECT_NULL) { + object = shadow; + continue; + } + return; + } +} -#if MIGHT_NOT_CACHE_SHADOWS - /* - * Remove shadow now if we don't - * want to cache shadows. - */ - if (! cache_shadows) { - shadow = object->shadow; - object->shadow = VM_OBJECT_NULL; - } -#endif /* MIGHT_NOT_CACHE_SHADOWS */ - /* - * Enter the object onto the queue of - * cached objects, and deactivate - * all of its pages. - */ - assert(object->shadow == VM_OBJECT_NULL); - VM_OBJ_RES_DECR(object); - XPR(XPR_VM_OBJECT, - "vm_o_deallocate: adding %x to cache, queue = (%x, %x)\n", - (integer_t)object, - (integer_t)vm_object_cached_list.next, - (integer_t)vm_object_cached_list.prev,0,0); - - vm_object_cached_count++; - if (vm_object_cached_count > vm_object_cached_high) - vm_object_cached_high = vm_object_cached_count; - queue_enter(&vm_object_cached_list, object, - vm_object_t, cached_list); - vm_object_cache_unlock(); - vm_object_deactivate_all_pages(object); - vm_object_unlock(object); -#if MIGHT_NOT_CACHE_SHADOWS - /* - * If we have a shadow that we need - * to deallocate, do so now, remembering - * to trim the cache later. - */ - if (! cache_shadows && shadow != VM_OBJECT_NULL) { - object = shadow; - retry_cache_trim = TRUE; - continue; - } -#endif /* MIGHT_NOT_CACHE_SHADOWS */ +vm_page_t +vm_object_page_grab( + vm_object_t object) +{ + vm_page_t p, next_p; + int p_limit = 0; + int p_skipped = 0; - /* - * Trim the cache. If the cache trim - * returns with a shadow for us to deallocate, - * then remember to retry the cache trim - * when we are done deallocating the shadow. - * Otherwise, we are done. - */ + vm_object_lock_assert_exclusive(object); - object = vm_object_cache_trim(TRUE); - if (object == VM_OBJECT_NULL) { - return; - } - retry_cache_trim = TRUE; + next_p = (vm_page_t)vm_page_queue_first(&object->memq); + p_limit = MIN(50, object->resident_page_count); - } else { - /* - * This object is not cachable; terminate it. - */ - XPR(XPR_VM_OBJECT, - "vm_o_deallocate: !cacheable 0x%X res %d paging_ops %d thread 0x%p ref %d\n", - (integer_t)object, object->resident_page_count, - object->paging_in_progress, - (void *)current_thread(),object->ref_count); + while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next_p) && --p_limit > 0) { + p = next_p; + next_p = (vm_page_t)vm_page_queue_next(&next_p->vmp_listq); - VM_OBJ_RES_DECR(object); /* XXX ? */ - /* - * Terminate this object. If it had a shadow, - * then deallocate it; otherwise, if we need - * to retry a cache trim, do so now; otherwise, - * we are done. 
"pageout" objects have a shadow, - * but maintain a "paging reference" rather than - * a normal reference. - */ - shadow = object->pageout?VM_OBJECT_NULL:object->shadow; - if(vm_object_terminate(object) != KERN_SUCCESS) { - return; - } - if (shadow != VM_OBJECT_NULL) { - object = shadow; - continue; - } - if (retry_cache_trim && - ((object = vm_object_cache_trim(TRUE)) != - VM_OBJECT_NULL)) { - continue; - } - return; + if (VM_PAGE_WIRED(p) || p->vmp_busy || p->vmp_cleaning || p->vmp_laundry || p->vmp_fictitious) { + goto move_page_in_obj; } - } - assert(! retry_cache_trim); -} -/* - * Check to see whether we really need to trim - * down the cache. If so, remove an object from - * the cache, terminate it, and repeat. - * - * Called with, and returns with, cache lock unlocked. - */ -vm_object_t -vm_object_cache_trim( - boolean_t called_from_vm_object_deallocate) -{ - register vm_object_t object = VM_OBJECT_NULL; - vm_object_t shadow; + if (p->vmp_pmapped || p->vmp_dirty || p->vmp_precious) { + vm_page_lockspin_queues(); - for (;;) { + if (p->vmp_pmapped) { + int refmod_state; - /* - * If we no longer need to trim the cache, - * then we are done. - */ + vm_object_page_grab_pmapped++; - vm_object_cache_lock(); - if (vm_object_cached_count <= vm_object_cached_max) { - vm_object_cache_unlock(); - return VM_OBJECT_NULL; - } + if (p->vmp_reference == FALSE || p->vmp_dirty == FALSE) { + refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(p)); - /* - * We must trim down the cache, so remove - * the first object in the cache. - */ - XPR(XPR_VM_OBJECT, - "vm_object_cache_trim: removing from front of cache (%x, %x)\n", - (integer_t)vm_object_cached_list.next, - (integer_t)vm_object_cached_list.prev, 0, 0, 0); - - object = (vm_object_t) queue_first(&vm_object_cached_list); - if(object == (vm_object_t) &vm_object_cached_list) { - /* something's wrong with the calling parameter or */ - /* the value of vm_object_cached_count, just fix */ - /* and return */ - if(vm_object_cached_max < 0) - vm_object_cached_max = 0; - vm_object_cached_count = 0; - vm_object_cache_unlock(); - return VM_OBJECT_NULL; - } - vm_object_lock(object); - queue_remove(&vm_object_cached_list, object, vm_object_t, - cached_list); - vm_object_cached_count--; + if (refmod_state & VM_MEM_REFERENCED) { + p->vmp_reference = TRUE; + } + if (refmod_state & VM_MEM_MODIFIED) { + SET_PAGE_DIRTY(p, FALSE); + } + } + if (p->vmp_dirty == FALSE && p->vmp_precious == FALSE) { + refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p)); - /* - * Since this object is in the cache, we know - * that it is initialized and has no references. - * Take a reference to avoid recursive deallocations. - */ + if (refmod_state & VM_MEM_REFERENCED) { + p->vmp_reference = TRUE; + } + if (refmod_state & VM_MEM_MODIFIED) { + SET_PAGE_DIRTY(p, FALSE); + } + + if (p->vmp_dirty == FALSE) { + goto take_page; + } + } + } + if ((p->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) && p->vmp_reference == TRUE) { + vm_page_activate(p); - assert(object->pager_initialized); - assert(object->ref_count == 0); - object->ref_count++; + VM_STAT_INCR(reactivations); + vm_object_page_grab_reactivations++; + } + vm_page_unlock_queues(); +move_page_in_obj: + vm_page_queue_remove(&object->memq, p, vmp_listq); + vm_page_queue_enter(&object->memq, p, vmp_listq); - /* - * Terminate the object. - * If the object had a shadow, we let vm_object_deallocate - * deallocate it. "pageout" objects have a shadow, but - * maintain a "paging reference" rather than a normal - * reference. 
- * (We are careful here to limit recursion.) - */ - shadow = object->pageout?VM_OBJECT_NULL:object->shadow; - if(vm_object_terminate(object) != KERN_SUCCESS) + p_skipped++; continue; - if (shadow != VM_OBJECT_NULL) { - if (called_from_vm_object_deallocate) { - return shadow; - } else { - vm_object_deallocate(shadow); - } } + vm_page_lockspin_queues(); +take_page: + vm_page_free_prepare_queues(p); + vm_object_page_grab_returned++; + vm_object_page_grab_skipped += p_skipped; + + vm_page_unlock_queues(); + + vm_page_free_prepare_object(p, TRUE); + + return p; } + vm_object_page_grab_skipped += p_skipped; + vm_object_page_grab_failed++; + + return NULL; } -boolean_t vm_object_terminate_remove_all = FALSE; -/* - * Routine: vm_object_terminate + +#define EVICT_PREPARE_LIMIT 64 +#define EVICT_AGE 10 + +static clock_sec_t vm_object_cache_aging_ts = 0; + +static void +vm_object_cache_remove_locked( + vm_object_t object) +{ + assert(object->purgable == VM_PURGABLE_DENY); + + queue_remove(&vm_object_cached_list, object, vm_object_t, cached_list); + object->cached_list.next = NULL; + object->cached_list.prev = NULL; + + vm_object_cached_count--; +} + +void +vm_object_cache_remove( + vm_object_t object) +{ + vm_object_cache_lock_spin(); + + if (object->cached_list.next && + object->cached_list.prev) { + vm_object_cache_remove_locked(object); + } + + vm_object_cache_unlock(); +} + +void +vm_object_cache_add( + vm_object_t object) +{ + clock_sec_t sec; + clock_nsec_t nsec; + + assert(object->purgable == VM_PURGABLE_DENY); + + if (object->resident_page_count == 0) { + return; + } + clock_get_system_nanotime(&sec, &nsec); + + vm_object_cache_lock_spin(); + + if (object->cached_list.next == NULL && + object->cached_list.prev == NULL) { + queue_enter(&vm_object_cached_list, object, vm_object_t, cached_list); + object->vo_cache_ts = sec + EVICT_AGE; + object->vo_cache_pages_to_scan = object->resident_page_count; + + vm_object_cached_count++; + vm_object_cache_adds++; + } + vm_object_cache_unlock(); +} + +int +vm_object_cache_evict( + int num_to_evict, + int max_objects_to_examine) +{ + vm_object_t object = VM_OBJECT_NULL; + vm_object_t next_obj = VM_OBJECT_NULL; + vm_page_t local_free_q = VM_PAGE_NULL; + vm_page_t p; + vm_page_t next_p; + int object_cnt = 0; + vm_page_t ep_array[EVICT_PREPARE_LIMIT]; + int ep_count; + int ep_limit; + int ep_index; + int ep_freed = 0; + int ep_moved = 0; + uint32_t ep_skipped = 0; + clock_sec_t sec; + clock_nsec_t nsec; + + KERNEL_DEBUG(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0); + /* + * do a couple of quick checks to see if it's + * worthwhile grabbing the lock + */ + if (queue_empty(&vm_object_cached_list)) { + KERNEL_DEBUG(0x13001ec | DBG_FUNC_END, 0, 0, 0, 0, 0); + return 0; + } + clock_get_system_nanotime(&sec, &nsec); + + /* + * the object on the head of the queue has not + * yet sufficiently aged + */ + if (sec < vm_object_cache_aging_ts) { + KERNEL_DEBUG(0x13001ec | DBG_FUNC_END, 0, 0, 0, 0, 0); + return 0; + } + /* + * don't need the queue lock to find + * and lock an object on the cached list + */ + vm_page_unlock_queues(); + + vm_object_cache_lock_spin(); + + for (;;) { + next_obj = (vm_object_t)queue_first(&vm_object_cached_list); + + while (!queue_end(&vm_object_cached_list, (queue_entry_t)next_obj) && object_cnt++ < max_objects_to_examine) { + object = next_obj; + next_obj = (vm_object_t)queue_next(&next_obj->cached_list); + + assert(object->purgable == VM_PURGABLE_DENY); + + if (sec < object->vo_cache_ts) { + KERNEL_DEBUG(0x130020c, object, 
object->resident_page_count, object->vo_cache_ts, sec, 0); + + vm_object_cache_aging_ts = object->vo_cache_ts; + object = VM_OBJECT_NULL; + break; + } + if (!vm_object_lock_try_scan(object)) { + /* + * just skip over this guy for now... if we find + * an object to steal pages from, we'll revist in a bit... + * hopefully, the lock will have cleared + */ + KERNEL_DEBUG(0x13001f8, object, object->resident_page_count, 0, 0, 0); + + object = VM_OBJECT_NULL; + continue; + } + if (vm_page_queue_empty(&object->memq) || object->vo_cache_pages_to_scan == 0) { + /* + * this case really shouldn't happen, but it's not fatal + * so deal with it... if we don't remove the object from + * the list, we'll never move past it. + */ + KERNEL_DEBUG(0x13001fc, object, object->resident_page_count, ep_freed, ep_moved, 0); + + vm_object_cache_remove_locked(object); + vm_object_unlock(object); + object = VM_OBJECT_NULL; + continue; + } + /* + * we have a locked object with pages... + * time to start harvesting + */ + break; + } + vm_object_cache_unlock(); + + if (object == VM_OBJECT_NULL) { + break; + } + + /* + * object is locked at this point and + * has resident pages + */ + next_p = (vm_page_t)vm_page_queue_first(&object->memq); + + /* + * break the page scan into 2 pieces to minimize the time spent + * behind the page queue lock... + * the list of pages on these unused objects is likely to be cold + * w/r to the cpu cache which increases the time to scan the list + * tenfold... and we may have a 'run' of pages we can't utilize that + * needs to be skipped over... + */ + if ((ep_limit = num_to_evict - (ep_freed + ep_moved)) > EVICT_PREPARE_LIMIT) { + ep_limit = EVICT_PREPARE_LIMIT; + } + ep_count = 0; + + while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next_p) && object->vo_cache_pages_to_scan && ep_count < ep_limit) { + p = next_p; + next_p = (vm_page_t)vm_page_queue_next(&next_p->vmp_listq); + + object->vo_cache_pages_to_scan--; + + if (VM_PAGE_WIRED(p) || p->vmp_busy || p->vmp_cleaning || p->vmp_laundry) { + vm_page_queue_remove(&object->memq, p, vmp_listq); + vm_page_queue_enter(&object->memq, p, vmp_listq); + + ep_skipped++; + continue; + } + if (p->vmp_wpmapped || p->vmp_dirty || p->vmp_precious) { + vm_page_queue_remove(&object->memq, p, vmp_listq); + vm_page_queue_enter(&object->memq, p, vmp_listq); + + pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(p)); + } + ep_array[ep_count++] = p; + } + KERNEL_DEBUG(0x13001f4 | DBG_FUNC_START, object, object->resident_page_count, ep_freed, ep_moved, 0); + + vm_page_lockspin_queues(); + + for (ep_index = 0; ep_index < ep_count; ep_index++) { + p = ep_array[ep_index]; + + if (p->vmp_wpmapped || p->vmp_dirty || p->vmp_precious) { + p->vmp_reference = FALSE; + p->vmp_no_cache = FALSE; + + /* + * we've already filtered out pages that are in the laundry + * so if we get here, this page can't be on the pageout queue + */ + vm_page_queues_remove(p, FALSE); + vm_page_enqueue_inactive(p, TRUE); + + ep_moved++; + } else { +#if CONFIG_PHANTOM_CACHE + vm_phantom_cache_add_ghost(p); +#endif + vm_page_free_prepare_queues(p); + + assert(p->vmp_pageq.next == 0 && p->vmp_pageq.prev == 0); + /* + * Add this page to our list of reclaimed pages, + * to be freed later. 
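The eviction code here never frees pages one at a time while holding the page-queues lock: each reclaimed page is chained onto a private list through vmp_snext, and the whole chain is handed to vm_page_free_list() in a single call after the lock is dropped. A stripped-down sketch of that accumulate-then-free pattern, written as a hypothetical helper over an array of already-chosen victim pages (the per-page calls are the kernel's own, the helper name is invented):

static void
free_page_batch(vm_page_t *pages, int count)
{
        vm_page_t       local_free_q = VM_PAGE_NULL;
        int             i;

        vm_page_lockspin_queues();
        for (i = 0; i < count; i++) {
                vm_page_t p = pages[i];

                vm_page_free_prepare_queues(p); /* per-page work that needs the queues lock */
                p->vmp_snext = local_free_q;    /* chain onto the private list */
                local_free_q = p;
        }
        vm_page_unlock_queues();

        /* free the whole chain in one call, with no locks held */
        if (local_free_q != VM_PAGE_NULL) {
                vm_page_free_list(local_free_q, TRUE);
        }
}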
+ */ + p->vmp_snext = local_free_q; + local_free_q = p; + + ep_freed++; + } + } + vm_page_unlock_queues(); + + KERNEL_DEBUG(0x13001f4 | DBG_FUNC_END, object, object->resident_page_count, ep_freed, ep_moved, 0); + + if (local_free_q) { + vm_page_free_list(local_free_q, TRUE); + local_free_q = VM_PAGE_NULL; + } + if (object->vo_cache_pages_to_scan == 0) { + KERNEL_DEBUG(0x1300208, object, object->resident_page_count, ep_freed, ep_moved, 0); + + vm_object_cache_remove(object); + + KERNEL_DEBUG(0x13001fc, object, object->resident_page_count, ep_freed, ep_moved, 0); + } + /* + * done with this object + */ + vm_object_unlock(object); + object = VM_OBJECT_NULL; + + /* + * at this point, we are not holding any locks + */ + if ((ep_freed + ep_moved) >= num_to_evict) { + /* + * we've reached our target for the + * number of pages to evict + */ + break; + } + vm_object_cache_lock_spin(); + } + /* + * put the page queues lock back to the caller's + * idea of it + */ + vm_page_lock_queues(); + + vm_object_cache_pages_freed += ep_freed; + vm_object_cache_pages_moved += ep_moved; + vm_object_cache_pages_skipped += ep_skipped; + + KERNEL_DEBUG(0x13001ec | DBG_FUNC_END, ep_freed, 0, 0, 0, 0); + return ep_freed; +} + +/* + * Routine: vm_object_terminate * Purpose: * Free all resources associated with a vm_object. * In/out conditions: @@ -940,18 +1320,14 @@ boolean_t vm_object_terminate_remove_all = FALSE; */ static kern_return_t vm_object_terminate( - register vm_object_t object) + vm_object_t object) { - register vm_page_t p; - vm_object_t shadow_object; + vm_object_t shadow_object; - XPR(XPR_VM_OBJECT, "vm_object_terminate, object 0x%X ref %d\n", - (integer_t)object, object->ref_count, 0, 0, 0); + vm_object_lock_assert_exclusive(object); - if (!object->pageout && (!object->temporary || object->can_persist) - && (object->pager != NULL || object->shadow_severed)) { - vm_object_cache_unlock(); - while (!queue_empty(&object->memq)) { + if (!object->pageout && (!object->internal && object->can_persist) && + (object->pager != NULL || object->shadow_severed)) { /* * Clear pager_trusted bit so that the pages get yanked * out of the object instead of cleaned in place. This @@ -959,66 +1335,15 @@ vm_object_terminate( */ object->pager_trusted = FALSE; - p = (vm_page_t) queue_first(&object->memq); - - VM_PAGE_CHECK(p); - - if (p->busy || p->cleaning) { - if(p->cleaning || p->absent) { - vm_object_paging_wait(object, THREAD_UNINT); - continue; - } else { - panic("vm_object_terminate.3 0x%x 0x%x", object, p); - } - } - - vm_page_lock_queues(); - p->busy = TRUE; - VM_PAGE_QUEUES_REMOVE(p); - vm_page_unlock_queues(); - - if (p->absent || p->private) { - - /* - * For private pages, VM_PAGE_FREE just - * leaves the page structure around for - * its owner to clean up. For absent - * pages, the structure is returned to - * the appropriate pool. 
- */ - - goto free_page; - } - - if (p->fictitious) - panic("vm_object_terminate.4 0x%x 0x%x", object, p); - - if (!p->dirty) - p->dirty = pmap_is_modified(p->phys_page); - - if ((p->dirty || p->precious) && !p->error && object->alive) { - vm_pageout_cluster(p); /* flush page */ - vm_object_paging_wait(object, THREAD_UNINT); - XPR(XPR_VM_OBJECT, - "vm_object_terminate restart, object 0x%X ref %d\n", - (integer_t)object, object->ref_count, 0, 0, 0); - } else { - free_page: - VM_PAGE_FREE(p); - } - } - vm_object_unlock(object); - vm_object_cache_lock(); - vm_object_lock(object); + vm_object_reap_pages(object, REAP_TERMINATE); } - /* * Make sure the object isn't already being terminated */ - if(object->terminating) { - object->ref_count -= 1; + if (object->terminating) { + vm_object_lock_assert_exclusive(object); + object->ref_count--; assert(object->ref_count > 0); - vm_object_cache_unlock(); vm_object_unlock(object); return KERN_FAILURE; } @@ -1027,11 +1352,11 @@ vm_object_terminate( * Did somebody get a reference to the object while we were * cleaning it? */ - if(object->ref_count != 1) { - object->ref_count -= 1; + if (object->ref_count != 1) { + vm_object_lock_assert_exclusive(object); + object->ref_count--; assert(object->ref_count > 0); vm_object_res_deallocate(object); - vm_object_cache_unlock(); vm_object_unlock(object); return KERN_FAILURE; } @@ -1042,7 +1367,12 @@ vm_object_terminate( object->terminating = TRUE; object->alive = FALSE; - vm_object_remove(object); + + if (!object->internal && + object->cached_list.next && + object->cached_list.prev) { + vm_object_cache_remove(object); + } /* * Detach the object from its shadow if we are the shadow's @@ -1052,12 +1382,14 @@ vm_object_terminate( if (((shadow_object = object->shadow) != VM_OBJECT_NULL) && !(object->pageout)) { vm_object_lock(shadow_object); - if (shadow_object->copy == object) + if (shadow_object->copy == object) { shadow_object->copy = VM_OBJECT_NULL; + } vm_object_unlock(shadow_object); } - if (object->paging_in_progress != 0) { + if (object->paging_in_progress != 0 || + object->activity_in_progress != 0) { /* * There are still some paging_in_progress references * on this object, meaning that there are some paging @@ -1080,7 +1412,6 @@ vm_object_terminate( * VM object is "terminating" and not "alive". */ vm_object_reap_async(object); - vm_object_cache_unlock(); vm_object_unlock(object); /* * Return KERN_FAILURE to let the caller know that we @@ -1091,93 +1422,179 @@ vm_object_terminate( */ return KERN_FAILURE; } - - /* complete the VM object termination */ + /* + * complete the VM object termination + */ vm_object_reap(object); object = VM_OBJECT_NULL; - /* cache lock and object lock were released by vm_object_reap() */ + /* + * the object lock was released by vm_object_reap() + * + * KERN_SUCCESS means that this object has been terminated + * and no longer needs its shadow object but still holds a + * reference on it. + * The caller is responsible for dropping that reference. + * We can't call vm_object_deallocate() here because that + * would create a recursion. + */ return KERN_SUCCESS; } + /* * vm_object_reap(): * * Complete the termination of a VM object after it's been marked * as "terminating" and "!alive" by vm_object_terminate(). * - * The VM object cache and the VM object must be locked by caller. - * The locks will be released on return and the VM object is no longer valid. + * The VM object must be locked by caller. + * The lock will be released on return and the VM object is no longer valid. 
*/ + void vm_object_reap( vm_object_t object) { - memory_object_t pager; - vm_page_t p; + memory_object_t pager; -#if DEBUG - mutex_assert(&vm_object_cached_lock_data, MA_OWNED); - mutex_assert(&object->Lock, MA_OWNED); -#endif /* DEBUG */ + vm_object_lock_assert_exclusive(object); + assert(object->paging_in_progress == 0); + assert(object->activity_in_progress == 0); vm_object_reap_count++; /* - * The pageout daemon might be playing with our pages. - * Now that the object is dead, it won't touch any more - * pages, but some pages might already be on their way out. - * Hence, we wait until the active paging activities have - * ceased before we break the association with the pager - * itself. + * Disown this purgeable object to cleanup its owner's purgeable + * ledgers. We need to do this before disconnecting the object + * from its pager, to properly account for compressed pages. */ - while (object->paging_in_progress != 0) { - vm_object_cache_unlock(); - vm_object_wait(object, - VM_OBJECT_EVENT_PAGING_IN_PROGRESS, - THREAD_UNINT); - vm_object_cache_lock(); - vm_object_lock(object); + if (object->internal && + (object->purgable != VM_PURGABLE_DENY || + object->vo_ledger_tag)) { + int ledger_flags; + kern_return_t kr; + + ledger_flags = 0; + if (object->vo_no_footprint) { + ledger_flags |= VM_LEDGER_FLAG_NO_FOOTPRINT; + } + assert(!object->alive); + assert(object->terminating); + kr = vm_object_ownership_change(object, + object->vo_ledger_tag, /* unchanged */ + NULL, /* no owner */ + ledger_flags, + FALSE); /* task_objq not locked */ + assert(kr == KERN_SUCCESS); + assert(object->vo_owner == NULL); } - assert(object->paging_in_progress == 0); pager = object->pager; object->pager = MEMORY_OBJECT_NULL; - if (pager != MEMORY_OBJECT_NULL) + if (pager != MEMORY_OBJECT_NULL) { memory_object_control_disable(object->pager_control); - vm_object_cache_unlock(); + } object->ref_count--; -#if TASK_SWAPPER +#if TASK_SWAPPER assert(object->res_count == 0); -#endif /* TASK_SWAPPER */ +#endif /* TASK_SWAPPER */ - assert (object->ref_count == 0); + assert(object->ref_count == 0); /* - * Clean or free the pages, as appropriate. - * It is possible for us to find busy/absent pages, - * if some faults on this object were aborted. + * remove from purgeable queue if it's on */ - if (object->pageout) { - assert(object->shadow != VM_OBJECT_NULL); + if (object->internal) { + assert(VM_OBJECT_OWNER(object) == TASK_NULL); - vm_pageout_object_terminate(object); + VM_OBJECT_UNWIRED(object); - } else if ((object->temporary && !object->can_persist) || - (pager == MEMORY_OBJECT_NULL)) { - while (!queue_empty(&object->memq)) { - p = (vm_page_t) queue_first(&object->memq); + if (object->purgable == VM_PURGABLE_DENY) { + /* not purgeable: nothing to do */ + } else if (object->purgable == VM_PURGABLE_VOLATILE) { + purgeable_q_t queue; - VM_PAGE_CHECK(p); - VM_PAGE_FREE(p); + queue = vm_purgeable_object_remove(object); + assert(queue); + + if (object->purgeable_when_ripe) { + /* + * Must take page lock for this - + * using it to protect token queue + */ + vm_page_lock_queues(); + vm_purgeable_token_delete_first(queue); + + assert(queue->debug_count_objects >= 0); + vm_page_unlock_queues(); + } + + /* + * Update "vm_page_purgeable_count" in bulk and mark + * object as VM_PURGABLE_EMPTY to avoid updating + * "vm_page_purgeable_count" again in vm_page_remove() + * when reaping the pages. 
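The purgeable accounting in this hunk is deliberately batched: rather than decrementing vm_page_purgeable_count once per page while the pages are reaped, the object is marked VM_PURGABLE_EMPTY and the whole delta is applied with one atomic add, as the code that follows shows. A tiny sketch of that bulk counter update against a hypothetical global counter:

extern volatile SInt32 example_purgeable_count;        /* stand-in for vm_page_purgeable_count */

static void
retire_purgeable_pages(unsigned int resident, unsigned int wired)
{
        unsigned int delta;

        assert(resident >= wired);
        delta = resident - wired;       /* only the unwired pages were counted as purgeable */
        if (delta != 0) {
                /* one atomic update for the whole object instead of one per page */
                OSAddAtomic(-(SInt32)delta, (SInt32 *)&example_purgeable_count);
        }
}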
+ */ + unsigned int delta; + assert(object->resident_page_count >= + object->wired_page_count); + delta = (object->resident_page_count - + object->wired_page_count); + if (delta != 0) { + assert(vm_page_purgeable_count >= delta); + OSAddAtomic(-delta, + (SInt32 *)&vm_page_purgeable_count); + } + if (object->wired_page_count != 0) { + assert(vm_page_purgeable_wired_count >= + object->wired_page_count); + OSAddAtomic(-object->wired_page_count, + (SInt32 *)&vm_page_purgeable_wired_count); + } + object->purgable = VM_PURGABLE_EMPTY; + } else if (object->purgable == VM_PURGABLE_NONVOLATILE || + object->purgable == VM_PURGABLE_EMPTY) { + /* remove from nonvolatile queue */ + vm_purgeable_nonvolatile_dequeue(object); + } else { + panic("object %p in unexpected purgeable state 0x%x\n", + object, object->purgable); + } + if (object->transposed && + object->cached_list.next != NULL && + object->cached_list.prev == NULL) { + /* + * object->cached_list.next "points" to the + * object that was transposed with this object. + */ + } else { + assert(object->cached_list.next == NULL); } - } else if (!queue_empty(&object->memq)) { - panic("vm_object_reap: queue just emptied isn't"); + assert(object->cached_list.prev == NULL); } + if (object->pageout) { + /* + * free all remaining pages tabled on + * this object + * clean up it's shadow + */ + assert(object->shadow != VM_OBJECT_NULL); + + vm_pageout_object_terminate(object); + } else if (object->resident_page_count) { + /* + * free all remaining pages tabled on + * this object + */ + vm_object_reap_pages(object, REAP_REAP); + } + assert(vm_page_queue_empty(&object->memq)); assert(object->paging_in_progress == 0); + assert(object->activity_in_progress == 0); assert(object->ref_count == 0); /* @@ -1197,12 +1614,16 @@ vm_object_reap( vm_object_paging_end(object); vm_object_unlock(object); -#if MACH_PAGEMAP - vm_external_destroy(object->existence_map, object->size); -#endif /* MACH_PAGEMAP */ - object->shadow = VM_OBJECT_NULL; +#if VM_OBJECT_TRACKING + if (vm_object_tracking_inited) { + btlog_remove_entries_for_element(vm_object_tracking_btlog, + object); + } +#endif /* VM_OBJECT_TRACKING */ + + vm_object_lock_destroy(object); /* * Free the space for the object. */ @@ -1210,119 +1631,351 @@ vm_object_reap( object = VM_OBJECT_NULL; } -void -vm_object_reap_async( - vm_object_t object) -{ -#if DEBUG - mutex_assert(&vm_object_cached_lock_data, MA_OWNED); - mutex_assert(&object->Lock, MA_OWNED); -#endif /* DEBUG */ - vm_object_reap_count_async++; +unsigned int vm_max_batch = 256; - /* enqueue the VM object... */ - queue_enter(&vm_object_reaper_queue, object, - vm_object_t, cached_list); - /* ... and wake up the reaper thread */ - thread_wakeup((event_t) &vm_object_reaper_queue); -} +#define V_O_R_MAX_BATCH 128 -void -vm_object_reaper_thread(void) -{ - vm_object_t object, shadow_object; +#define BATCH_LIMIT(max) (vm_max_batch >= max ? 
max : vm_max_batch) - vm_object_cache_lock(); - while (!queue_empty(&vm_object_reaper_queue)) { - queue_remove_first(&vm_object_reaper_queue, - object, - vm_object_t, - cached_list); - vm_object_lock(object); - assert(object->terminating); - assert(!object->alive); +#define VM_OBJ_REAP_FREELIST(_local_free_q, do_disconnect) \ + MACRO_BEGIN \ + if (_local_free_q) { \ + if (do_disconnect) { \ + vm_page_t m; \ + for (m = _local_free_q; \ + m != VM_PAGE_NULL; \ + m = m->vmp_snext) { \ + if (m->vmp_pmapped) { \ + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); \ + } \ + } \ + } \ + vm_page_free_list(_local_free_q, TRUE); \ + _local_free_q = VM_PAGE_NULL; \ + } \ + MACRO_END - shadow_object = - object->pageout ? VM_OBJECT_NULL : object->shadow; - vm_object_reap(object); - /* cache is unlocked and object is no longer valid */ - object = VM_OBJECT_NULL; +void +vm_object_reap_pages( + vm_object_t object, + int reap_type) +{ + vm_page_t p; + vm_page_t next; + vm_page_t local_free_q = VM_PAGE_NULL; + int loop_count; + boolean_t disconnect_on_release; + pmap_flush_context pmap_flush_context_storage; + + if (reap_type == REAP_DATA_FLUSH) { + /* + * We need to disconnect pages from all pmaps before + * releasing them to the free list + */ + disconnect_on_release = TRUE; + } else { + /* + * Either the caller has already disconnected the pages + * from all pmaps, or we disconnect them here as we add + * them to out local list of pages to be released. + * No need to re-disconnect them when we release the pages + * to the free list. + */ + disconnect_on_release = FALSE; + } - if (shadow_object != VM_OBJECT_NULL) { - /* - * Drop the reference "object" was holding on - * its shadow object. - */ - vm_object_deallocate(shadow_object); - shadow_object = VM_OBJECT_NULL; - } +restart_after_sleep: + if (vm_page_queue_empty(&object->memq)) { + return; + } + loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH); - vm_object_cache_lock(); + if (reap_type == REAP_PURGEABLE) { + pmap_flush_context_init(&pmap_flush_context_storage); } - /* wait for more work... */ - assert_wait((event_t) &vm_object_reaper_queue, THREAD_UNINT); - vm_object_cache_unlock(); - thread_block((thread_continue_t) vm_object_reaper_thread); - /*NOTREACHED*/ -} + vm_page_lockspin_queues(); -/* - * Routine: vm_object_pager_wakeup - * Purpose: Wake up anyone waiting for termination of a pager. 
- */ + next = (vm_page_t)vm_page_queue_first(&object->memq); -static void -vm_object_pager_wakeup( - memory_object_t pager) -{ - vm_object_hash_entry_t entry; - boolean_t waiting = FALSE; + while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next)) { + p = next; + next = (vm_page_t)vm_page_queue_next(&next->vmp_listq); + + if (--loop_count == 0) { + vm_page_unlock_queues(); + + if (local_free_q) { + if (reap_type == REAP_PURGEABLE) { + pmap_flush(&pmap_flush_context_storage); + pmap_flush_context_init(&pmap_flush_context_storage); + } + /* + * Free the pages we reclaimed so far + * and take a little break to avoid + * hogging the page queue lock too long + */ + VM_OBJ_REAP_FREELIST(local_free_q, + disconnect_on_release); + } else { + mutex_pause(0); + } + + loop_count = BATCH_LIMIT(V_O_R_MAX_BATCH); + + vm_page_lockspin_queues(); + } + if (reap_type == REAP_DATA_FLUSH || reap_type == REAP_TERMINATE) { + if (p->vmp_busy || p->vmp_cleaning) { + vm_page_unlock_queues(); + /* + * free the pages reclaimed so far + */ + VM_OBJ_REAP_FREELIST(local_free_q, + disconnect_on_release); + + PAGE_SLEEP(object, p, THREAD_UNINT); + + goto restart_after_sleep; + } + if (p->vmp_laundry) { + vm_pageout_steal_laundry(p, TRUE); + } + } + switch (reap_type) { + case REAP_DATA_FLUSH: + if (VM_PAGE_WIRED(p)) { + /* + * this is an odd case... perhaps we should + * zero-fill this page since we're conceptually + * tossing its data at this point, but leaving + * it on the object to honor the 'wire' contract + */ + continue; + } + break; + + case REAP_PURGEABLE: + if (VM_PAGE_WIRED(p)) { + /* + * can't purge a wired page + */ + vm_page_purged_wired++; + continue; + } + if (p->vmp_laundry && !p->vmp_busy && !p->vmp_cleaning) { + vm_pageout_steal_laundry(p, TRUE); + } + + if (p->vmp_cleaning || p->vmp_laundry || p->vmp_absent) { + /* + * page is being acted upon, + * so don't mess with it + */ + vm_page_purged_others++; + continue; + } + if (p->vmp_busy) { + /* + * We can't reclaim a busy page but we can + * make it more likely to be paged (it's not wired) to make + * sure that it gets considered by + * vm_pageout_scan() later. + */ + if (VM_PAGE_PAGEABLE(p)) { + vm_page_deactivate(p); + } + vm_page_purged_busy++; + continue; + } + + assert(VM_PAGE_OBJECT(p) != kernel_object); + + /* + * we can discard this page... + */ + if (p->vmp_pmapped == TRUE) { + /* + * unmap the page + */ + pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(p), PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_NOREFMOD, (void *)&pmap_flush_context_storage); + } + vm_page_purged_count++; + + break; + + case REAP_TERMINATE: + if (p->vmp_absent || p->vmp_private) { + /* + * For private pages, VM_PAGE_FREE just + * leaves the page structure around for + * its owner to clean up. For absent + * pages, the structure is returned to + * the appropriate pool. + */ + break; + } + if (p->vmp_fictitious) { + assert(VM_PAGE_GET_PHYS_PAGE(p) == vm_page_guard_addr); + break; + } + if (!p->vmp_dirty && p->vmp_wpmapped) { + p->vmp_dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)); + } + + if ((p->vmp_dirty || p->vmp_precious) && !p->vmp_error && object->alive) { + assert(!object->internal); + + p->vmp_free_when_done = TRUE; + + if (!p->vmp_laundry) { + vm_page_queues_remove(p, TRUE); + /* + * flush page... 
page will be freed + * upon completion of I/O + */ + vm_pageout_cluster(p); + } + vm_page_unlock_queues(); + /* + * free the pages reclaimed so far + */ + VM_OBJ_REAP_FREELIST(local_free_q, + disconnect_on_release); + + vm_object_paging_wait(object, THREAD_UNINT); + + goto restart_after_sleep; + } + break; + + case REAP_REAP: + break; + } + vm_page_free_prepare_queues(p); + assert(p->vmp_pageq.next == 0 && p->vmp_pageq.prev == 0); + /* + * Add this page to our list of reclaimed pages, + * to be freed later. + */ + p->vmp_snext = local_free_q; + local_free_q = p; + } + vm_page_unlock_queues(); /* - * If anyone was waiting for the memory_object_terminate - * to be queued, wake them up now. + * Free the remaining reclaimed pages */ - vm_object_cache_lock(); - entry = vm_object_hash_lookup(pager, TRUE); - if (entry != VM_OBJECT_HASH_ENTRY_NULL) - waiting = entry->waiting; - vm_object_cache_unlock(); - if (entry != VM_OBJECT_HASH_ENTRY_NULL) { - if (waiting) - thread_wakeup((event_t) pager); - vm_object_hash_entry_free(entry); + if (reap_type == REAP_PURGEABLE) { + pmap_flush(&pmap_flush_context_storage); + } + + VM_OBJ_REAP_FREELIST(local_free_q, + disconnect_on_release); +} + + +void +vm_object_reap_async( + vm_object_t object) +{ + vm_object_lock_assert_exclusive(object); + + vm_object_reaper_lock_spin(); + + vm_object_reap_count_async++; + + /* enqueue the VM object... */ + queue_enter(&vm_object_reaper_queue, object, + vm_object_t, cached_list); + + vm_object_reaper_unlock(); + + /* ... and wake up the reaper thread */ + thread_wakeup((event_t) &vm_object_reaper_queue); +} + + +void +vm_object_reaper_thread(void) +{ + vm_object_t object, shadow_object; + + vm_object_reaper_lock_spin(); + + while (!queue_empty(&vm_object_reaper_queue)) { + queue_remove_first(&vm_object_reaper_queue, + object, + vm_object_t, + cached_list); + + vm_object_reaper_unlock(); + vm_object_lock(object); + + assert(object->terminating); + assert(!object->alive); + + /* + * The pageout daemon might be playing with our pages. + * Now that the object is dead, it won't touch any more + * pages, but some pages might already be on their way out. + * Hence, we wait until the active paging activities have + * ceased before we break the association with the pager + * itself. + */ + while (object->paging_in_progress != 0 || + object->activity_in_progress != 0) { + vm_object_wait(object, + VM_OBJECT_EVENT_PAGING_IN_PROGRESS, + THREAD_UNINT); + vm_object_lock(object); + } + + shadow_object = + object->pageout ? VM_OBJECT_NULL : object->shadow; + + vm_object_reap(object); + /* cache is unlocked and object is no longer valid */ + object = VM_OBJECT_NULL; + + if (shadow_object != VM_OBJECT_NULL) { + /* + * Drop the reference "object" was holding on + * its shadow object. + */ + vm_object_deallocate(shadow_object); + shadow_object = VM_OBJECT_NULL; + } + vm_object_reaper_lock_spin(); } + + /* wait for more work... */ + assert_wait((event_t) &vm_object_reaper_queue, THREAD_UNINT); + + vm_object_reaper_unlock(); + + thread_block((thread_continue_t) vm_object_reaper_thread); + /*NOTREACHED*/ } /* * Routine: vm_object_release_pager * Purpose: Terminate the pager and, upon completion, * release our last reference to it. - * just like memory_object_terminate, except - * that we wake up anyone blocked in vm_object_enter - * waiting for termination message to be queued - * before calling memory_object_init. */ static void vm_object_release_pager( - memory_object_t pager) + memory_object_t pager) { - /* * Terminate the pager. 
*/ (void) memory_object_terminate(pager); - /* - * Wakeup anyone waiting for this terminate - */ - vm_object_pager_wakeup(pager); - /* * Release reference to pager. */ @@ -1338,13 +1991,14 @@ vm_object_release_pager( */ kern_return_t vm_object_destroy( - vm_object_t object, - __unused kern_return_t reason) + vm_object_t object, + __unused kern_return_t reason) { - memory_object_t old_pager; + memory_object_t old_pager; - if (object == VM_OBJECT_NULL) - return(KERN_SUCCESS); + if (object == VM_OBJECT_NULL) { + return KERN_SUCCESS; + } /* * Remove the pager association immediately. @@ -1355,22 +2009,16 @@ vm_object_destroy( * the destroy call.] */ - vm_object_cache_lock(); vm_object_lock(object); object->can_persist = FALSE; object->named = FALSE; object->alive = FALSE; - /* - * Rip out the pager from the vm_object now... - */ - - vm_object_remove(object); old_pager = object->pager; object->pager = MEMORY_OBJECT_NULL; - if (old_pager != MEMORY_OBJECT_NULL) + if (old_pager != MEMORY_OBJECT_NULL) { memory_object_control_disable(object->pager_control); - vm_object_cache_unlock(); + } /* * Wait for the existing paging activity (that got @@ -1386,7 +2034,7 @@ vm_object_destroy( if (old_pager != MEMORY_OBJECT_NULL) { vm_object_release_pager(old_pager); - /* + /* * JMM - Release the caller's reference. This assumes the * caller had a reference to release, which is a big (but * currently valid) assumption if this is driven from the @@ -1394,123 +2042,575 @@ vm_object_destroy( * this call).. */ vm_object_deallocate(object); - } - return(KERN_SUCCESS); + return KERN_SUCCESS; } /* - * vm_object_deactivate_pages + * The "chunk" macros are used by routines below when looking for pages to deactivate. These + * exist because of the need to handle shadow chains. When deactivating pages, we only + * want to deactive the ones at the top most level in the object chain. In order to do + * this efficiently, the specified address range is divided up into "chunks" and we use + * a bit map to keep track of which pages have already been processed as we descend down + * the shadow chain. These chunk macros hide the details of the bit map implementation + * as much as we can. * - * Deactivate all pages in the specified object. (Keep its pages - * in memory even though it is no longer referenced.) + * For convenience, we use a 64-bit data type as the bit map, and therefore a chunk is + * set to 64 pages. The bit map is indexed from the low-order end, so that the lowest + * order bit represents page 0 in the current range and highest order bit represents + * page 63. * - * The object must be locked. + * For further convenience, we also use negative logic for the page state in the bit map. + * The bit is set to 1 to indicate it has not yet been seen, and to 0 to indicate it has + * been processed. This way we can simply test the 64-bit long word to see if it's zero + * to easily tell if the whole range has been processed. Therefore, the bit map starts + * out with all the bits set. The macros below hide all these details from the caller. */ -static void -vm_object_deactivate_all_pages( - register vm_object_t object) -{ - register vm_page_t p; - queue_iterate(&object->memq, p, vm_page_t, listq) { - vm_page_lock_queues(); - if (!p->busy) - vm_page_deactivate(p); - vm_page_unlock_queues(); +#define PAGES_IN_A_CHUNK 64 /* The number of pages in the chunk must */ + /* be the same as the number of bits in */ + /* the chunk_state_t type. We use 64 */ + /* just for convenience. 
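 *
 * A small usage sketch of the chunk bit map defined below (hypothetical
 * values; the real callers are deactivate_a_chunk() and
 * deactivate_pages_in_object() further down):
 *
 *      chunk_state_t cs;
 *
 *      CHUNK_INIT(cs, 5 * PAGE_SIZE_64);       // pages 0..4 pending,
 *                                              // pages 5..63 pre-marked
 *      assert(!PAGE_ALREADY_HANDLED(cs, 3));   // page 3 not yet seen
 *      MARK_PAGE_HANDLED(cs, 3);               // handled at this level
 *      assert(PAGE_ALREADY_HANDLED(cs, 3));    // skipped in shadow objects
 *      if (CHUNK_NOT_COMPLETE(cs)) {
 *              // pages 0, 1, 2 and 4 still need to be looked at,
 *              // possibly in a shadow object
 *      }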
*/ + +#define CHUNK_SIZE (PAGES_IN_A_CHUNK * PAGE_SIZE_64) /* Size of a chunk in bytes */ + +typedef uint64_t chunk_state_t; + +/* + * The bit map uses negative logic, so we start out with all 64 bits set to indicate + * that no pages have been processed yet. Also, if len is less than the full CHUNK_SIZE, + * then we mark pages beyond the len as having been "processed" so that we don't waste time + * looking at pages in that range. This can save us from unnecessarily chasing down the + * shadow chain. + */ + +#define CHUNK_INIT(c, len) \ + MACRO_BEGIN \ + uint64_t p; \ + \ + (c) = 0xffffffffffffffffLL; \ + \ + for (p = (len) / PAGE_SIZE_64; p < PAGES_IN_A_CHUNK; p++) \ + MARK_PAGE_HANDLED(c, p); \ + MACRO_END + + +/* + * Return true if all pages in the chunk have not yet been processed. + */ + +#define CHUNK_NOT_COMPLETE(c) ((c) != 0) + +/* + * Return true if the page at offset 'p' in the bit map has already been handled + * while processing a higher level object in the shadow chain. + */ + +#define PAGE_ALREADY_HANDLED(c, p) (((c) & (1ULL << (p))) == 0) + +/* + * Mark the page at offset 'p' in the bit map as having been processed. + */ + +#define MARK_PAGE_HANDLED(c, p) \ +MACRO_BEGIN \ + (c) = (c) & ~(1ULL << (p)); \ +MACRO_END + + +/* + * Return true if the page at the given offset has been paged out. Object is + * locked upon entry and returned locked. + */ + +static boolean_t +page_is_paged_out( + vm_object_t object, + vm_object_offset_t offset) +{ + if (object->internal && + object->alive && + !object->terminating && + object->pager_ready) { + if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset) + == VM_EXTERNAL_STATE_EXISTS) { + return TRUE; + } } + return FALSE; } -__private_extern__ void -vm_object_deactivate_pages( - vm_object_t object, - vm_object_offset_t offset, - vm_object_size_t size, - boolean_t kill_page) + + +/* + * madvise_free_debug + * + * To help debug madvise(MADV_FREE*) mis-usage, this triggers a + * zero-fill as soon as a page is affected by a madvise(MADV_FREE*), to + * simulate the loss of the page's contents as if the page had been + * reclaimed and then re-faulted. + */ +#if DEVELOPMENT || DEBUG +int madvise_free_debug = 1; +#else /* DEBUG */ +int madvise_free_debug = 0; +#endif /* DEBUG */ + +/* + * Deactivate the pages in the specified object and range. If kill_page is set, also discard any + * page modified state from the pmap. Update the chunk_state as we go along. The caller must specify + * a size that is less than or equal to the CHUNK_SIZE. + */ + +static void +deactivate_pages_in_object( + vm_object_t object, + vm_object_offset_t offset, + vm_object_size_t size, + boolean_t kill_page, + boolean_t reusable_page, + boolean_t all_reusable, + chunk_state_t *chunk_state, + pmap_flush_context *pfc, + struct pmap *pmap, + vm_map_offset_t pmap_offset) { - vm_object_t orig_object; - int pages_moved = 0; - int pages_found = 0; + vm_page_t m; + int p; + struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT]; + struct vm_page_delayed_work *dwp; + int dw_count; + int dw_limit; + unsigned int reusable = 0; /* - * entered with object lock held, acquire a paging reference to - * prevent the memory_object and control ports from - * being destroyed. + * Examine each page in the chunk. The variable 'p' is the page number relative to the start of the + * chunk. Since this routine is called once for each level in the shadow chain, the chunk_state may + * have pages marked as having been processed already. 
We stop the loop early if we find we've handled + * all the pages in the chunk. */ - orig_object = object; - for (;;) { - register vm_page_t m; - vm_object_offset_t toffset; - vm_object_size_t tsize; + dwp = &dw_array[0]; + dw_count = 0; + dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); - vm_object_paging_begin(object); - vm_page_lock_queues(); + for (p = 0; size && CHUNK_NOT_COMPLETE(*chunk_state); p++, size -= PAGE_SIZE_64, offset += PAGE_SIZE_64, pmap_offset += PAGE_SIZE_64) { + /* + * If this offset has already been found and handled in a higher level object, then don't + * do anything with it in the current shadow object. + */ + + if (PAGE_ALREADY_HANDLED(*chunk_state, p)) { + continue; + } - for (tsize = size, toffset = offset; tsize; tsize -= PAGE_SIZE, toffset += PAGE_SIZE) { + /* + * See if the page at this offset is around. First check to see if the page is resident, + * then if not, check the existence map or with the pager. + */ - if ((m = vm_page_lookup(object, toffset)) != VM_PAGE_NULL) { + if ((m = vm_page_lookup(object, offset)) != VM_PAGE_NULL) { + /* + * We found a page we were looking for. Mark it as "handled" now in the chunk_state + * so that we won't bother looking for a page at this offset again if there are more + * shadow objects. Then deactivate the page. + */ - pages_found++; + MARK_PAGE_HANDLED(*chunk_state, p); - if ((m->wire_count == 0) && (!m->private) && (!m->gobbled) && (!m->busy)) { + if ((!VM_PAGE_WIRED(m)) && (!m->vmp_private) && (!m->vmp_gobbled) && (!m->vmp_busy) && + (!m->vmp_laundry) && (!m->vmp_cleaning) && !(m->vmp_free_when_done)) { + int clear_refmod; + int pmap_options; - assert(!m->laundry); + dwp->dw_mask = 0; - m->reference = FALSE; - pmap_clear_reference(m->phys_page); + pmap_options = 0; + clear_refmod = VM_MEM_REFERENCED; + dwp->dw_mask |= DW_clear_reference; - if ((kill_page) && (object->internal)) { - m->precious = FALSE; - m->dirty = FALSE; - pmap_clear_modify(m->phys_page); - vm_external_state_clr(object->existence_map, offset); + if ((kill_page) && (object->internal)) { + if (madvise_free_debug) { + /* + * zero-fill the page now + * to simulate it being + * reclaimed and re-faulted. + */ + pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m)); } - VM_PAGE_QUEUES_REMOVE(m); - - assert(!m->laundry); - assert(m->object != kernel_object); - assert(m->pageq.next == NULL && - m->pageq.prev == NULL); - if(m->zero_fill) { - queue_enter_first( - &vm_page_queue_zf, - m, vm_page_t, pageq); - } else { - queue_enter_first( - &vm_page_queue_inactive, - m, vm_page_t, pageq); + m->vmp_precious = FALSE; + m->vmp_dirty = FALSE; + + clear_refmod |= VM_MEM_MODIFIED; + if (m->vmp_q_state == VM_PAGE_ON_THROTTLED_Q) { + /* + * This page is now clean and + * reclaimable. Move it out + * of the throttled queue, so + * that vm_pageout_scan() can + * find it. + */ + dwp->dw_mask |= DW_move_page; + } + + VM_COMPRESSOR_PAGER_STATE_CLR(object, offset); + + if (reusable_page && !m->vmp_reusable) { + assert(!all_reusable); + assert(!object->all_reusable); + m->vmp_reusable = TRUE; + object->reusable_page_count++; + assert(object->resident_page_count >= object->reusable_page_count); + reusable++; + /* + * Tell pmap this page is now + * "reusable" (to update pmap + * stats for all mappings). 
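 *
 * Bookkeeping sketch for the "reusable" state (summarizing this branch):
 *
 *      object->reusable_page_count     // exact, updated under object lock
 *      vm_page_stats_reusable.*        // global, updated in batches via
 *                                      // OSAddAtomic when the local
 *                                      // "reusable" counter is drained
 *
 * with the invariant, asserted above,
 *
 *      object->reusable_page_count <= object->resident_page_count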
+ */ + pmap_options |= PMAP_OPTIONS_SET_REUSABLE; } + } + pmap_options |= PMAP_OPTIONS_NOFLUSH; + pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), + clear_refmod, + pmap_options, + (void *)pfc); + + if ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !(reusable_page || all_reusable)) { + dwp->dw_mask |= DW_move_page; + } + + if (dwp->dw_mask) { + VM_PAGE_ADD_DELAYED_WORK(dwp, m, + dw_count); + } - m->inactive = TRUE; - if (!m->fictitious) - vm_page_inactive_count++; + if (dw_count >= dw_limit) { + if (reusable) { + OSAddAtomic(reusable, + &vm_page_stats_reusable.reusable_count); + vm_page_stats_reusable.reusable += reusable; + reusable = 0; + } + vm_page_do_delayed_work(object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count); - pages_moved++; + dwp = &dw_array[0]; + dw_count = 0; } } - } - vm_page_unlock_queues(); - vm_object_paging_end(object); + } else { + /* + * The page at this offset isn't memory resident, check to see if it's + * been paged out. If so, mark it as handled so we don't bother looking + * for it in the shadow chain. + */ + + if (page_is_paged_out(object, offset)) { + MARK_PAGE_HANDLED(*chunk_state, p); - if (object->shadow) { - vm_object_t tmp_object; + /* + * If we're killing a non-resident page, then clear the page in the existence + * map so we don't bother paging it back in if it's touched again in the future. + */ - kill_page = 0; + if ((kill_page) && (object->internal)) { + VM_COMPRESSOR_PAGER_STATE_CLR(object, offset); - offset += object->shadow_offset; + if (pmap != PMAP_NULL) { + /* + * Tell pmap that this page + * is no longer mapped, to + * adjust the footprint ledger + * because this page is no + * longer compressed. + */ + pmap_remove_options( + pmap, + pmap_offset, + (pmap_offset + + PAGE_SIZE), + PMAP_OPTIONS_REMOVE); + } + } + } + } + } - tmp_object = object->shadow; - vm_object_lock(tmp_object); + if (reusable) { + OSAddAtomic(reusable, &vm_page_stats_reusable.reusable_count); + vm_page_stats_reusable.reusable += reusable; + reusable = 0; + } - if (object != orig_object) - vm_object_unlock(object); - object = tmp_object; - } else - break; + if (dw_count) { + vm_page_do_delayed_work(object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count); } - if (object != orig_object) - vm_object_unlock(object); +} + + +/* + * Deactive a "chunk" of the given range of the object starting at offset. A "chunk" + * will always be less than or equal to the given size. The total range is divided up + * into chunks for efficiency and performance related to the locks and handling the shadow + * chain. This routine returns how much of the given "size" it actually processed. It's + * up to the caler to loop and keep calling this routine until the entire range they want + * to process has been done. + */ + +static vm_object_size_t +deactivate_a_chunk( + vm_object_t orig_object, + vm_object_offset_t offset, + vm_object_size_t size, + boolean_t kill_page, + boolean_t reusable_page, + boolean_t all_reusable, + pmap_flush_context *pfc, + struct pmap *pmap, + vm_map_offset_t pmap_offset) +{ + vm_object_t object; + vm_object_t tmp_object; + vm_object_size_t length; + chunk_state_t chunk_state; + + + /* + * Get set to do a chunk. We'll do up to CHUNK_SIZE, but no more than the + * remaining size the caller asked for. + */ + + length = MIN(size, CHUNK_SIZE); + + /* + * The chunk_state keeps track of which pages we've already processed if there's + * a shadow chain on this object. 
At this point, we haven't done anything with this + * range of pages yet, so initialize the state to indicate no pages processed yet. + */ + + CHUNK_INIT(chunk_state, length); + object = orig_object; + + /* + * Start at the top level object and iterate around the loop once for each object + * in the shadow chain. We stop processing early if we've already found all the pages + * in the range. Otherwise we stop when we run out of shadow objects. + */ + + while (object && CHUNK_NOT_COMPLETE(chunk_state)) { + vm_object_paging_begin(object); + + deactivate_pages_in_object(object, offset, length, kill_page, reusable_page, all_reusable, &chunk_state, pfc, pmap, pmap_offset); + + vm_object_paging_end(object); + + /* + * We've finished with this object, see if there's a shadow object. If + * there is, update the offset and lock the new object. We also turn off + * kill_page at this point since we only kill pages in the top most object. + */ + + tmp_object = object->shadow; + + if (tmp_object) { + kill_page = FALSE; + reusable_page = FALSE; + all_reusable = FALSE; + offset += object->vo_shadow_offset; + vm_object_lock(tmp_object); + } + + if (object != orig_object) { + vm_object_unlock(object); + } + + object = tmp_object; + } + + if (object && object != orig_object) { + vm_object_unlock(object); + } + + return length; +} + + + +/* + * Move any resident pages in the specified range to the inactive queue. If kill_page is set, + * we also clear the modified status of the page and "forget" any changes that have been made + * to the page. + */ + +__private_extern__ void +vm_object_deactivate_pages( + vm_object_t object, + vm_object_offset_t offset, + vm_object_size_t size, + boolean_t kill_page, + boolean_t reusable_page, + struct pmap *pmap, + vm_map_offset_t pmap_offset) +{ + vm_object_size_t length; + boolean_t all_reusable; + pmap_flush_context pmap_flush_context_storage; + + /* + * We break the range up into chunks and do one chunk at a time. This is for + * efficiency and performance while handling the shadow chains and the locks. + * The deactivate_a_chunk() function returns how much of the range it processed. + * We keep calling this routine until the given size is exhausted. + */ + + + all_reusable = FALSE; +#if 11 + /* + * For the sake of accurate "reusable" pmap stats, we need + * to tell pmap about each page that is no longer "reusable", + * so we can't do the "all_reusable" optimization. + */ +#else + if (reusable_page && + object->internal && + object->vo_size != 0 && + object->vo_size == size && + object->reusable_page_count == 0) { + all_reusable = TRUE; + reusable_page = FALSE; + } +#endif + + if ((reusable_page || all_reusable) && object->all_reusable) { + /* This means MADV_FREE_REUSABLE has been called twice, which + * is probably illegal. 
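 *
 * For reference, the user-space sequence that normally drives this path
 * (a hedged sketch using Darwin-specific madvise flags; sizes and error
 * handling are illustrative only):
 *
 *      #include <sys/mman.h>
 *
 *      size_t len = 1 << 20;   // 1 MB, for example
 *      void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                       MAP_ANON | MAP_PRIVATE, -1, 0);
 *      // ... use buf ...
 *      madvise(buf, len, MADV_FREE_REUSABLE);  // pages become "reusable";
 *                                              // contents may be discarded
 *      // ... later, before touching buf again ...
 *      madvise(buf, len, MADV_FREE_REUSE);     // reclaim the range
 *
 * Calling MADV_FREE_REUSABLE twice on the same range without an
 * intervening MADV_FREE_REUSE is the situation this check guards
 * against.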
*/ + return; + } + + pmap_flush_context_init(&pmap_flush_context_storage); + + while (size) { + length = deactivate_a_chunk(object, offset, size, kill_page, reusable_page, all_reusable, &pmap_flush_context_storage, pmap, pmap_offset); + + size -= length; + offset += length; + pmap_offset += length; + } + pmap_flush(&pmap_flush_context_storage); + + if (all_reusable) { + if (!object->all_reusable) { + unsigned int reusable; + + object->all_reusable = TRUE; + assert(object->reusable_page_count == 0); + /* update global stats */ + reusable = object->resident_page_count; + OSAddAtomic(reusable, + &vm_page_stats_reusable.reusable_count); + vm_page_stats_reusable.reusable += reusable; + vm_page_stats_reusable.all_reusable_calls++; + } + } else if (reusable_page) { + vm_page_stats_reusable.partial_reusable_calls++; + } +} + +void +vm_object_reuse_pages( + vm_object_t object, + vm_object_offset_t start_offset, + vm_object_offset_t end_offset, + boolean_t allow_partial_reuse) +{ + vm_object_offset_t cur_offset; + vm_page_t m; + unsigned int reused, reusable; + +#define VM_OBJECT_REUSE_PAGE(object, m, reused) \ + MACRO_BEGIN \ + if ((m) != VM_PAGE_NULL && \ + (m)->vmp_reusable) { \ + assert((object)->reusable_page_count <= \ + (object)->resident_page_count); \ + assert((object)->reusable_page_count > 0); \ + (object)->reusable_page_count--; \ + (m)->vmp_reusable = FALSE; \ + (reused)++; \ + /* \ + * Tell pmap that this page is no longer \ + * "reusable", to update the "reusable" stats \ + * for all the pmaps that have mapped this \ + * page. \ + */ \ + pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE((m)), \ + 0, /* refmod */ \ + (PMAP_OPTIONS_CLEAR_REUSABLE \ + | PMAP_OPTIONS_NOFLUSH), \ + NULL); \ + } \ + MACRO_END + + reused = 0; + reusable = 0; + + vm_object_lock_assert_exclusive(object); + + if (object->all_reusable) { + panic("object %p all_reusable: can't update pmap stats\n", + object); + assert(object->reusable_page_count == 0); + object->all_reusable = FALSE; + if (end_offset - start_offset == object->vo_size || + !allow_partial_reuse) { + vm_page_stats_reusable.all_reuse_calls++; + reused = object->resident_page_count; + } else { + vm_page_stats_reusable.partial_reuse_calls++; + vm_page_queue_iterate(&object->memq, m, vmp_listq) { + if (m->vmp_offset < start_offset || + m->vmp_offset >= end_offset) { + m->vmp_reusable = TRUE; + object->reusable_page_count++; + assert(object->resident_page_count >= object->reusable_page_count); + continue; + } else { + assert(!m->vmp_reusable); + reused++; + } + } + } + } else if (object->resident_page_count > + ((end_offset - start_offset) >> PAGE_SHIFT)) { + vm_page_stats_reusable.partial_reuse_calls++; + for (cur_offset = start_offset; + cur_offset < end_offset; + cur_offset += PAGE_SIZE_64) { + if (object->reusable_page_count == 0) { + break; + } + m = vm_page_lookup(object, cur_offset); + VM_OBJECT_REUSE_PAGE(object, m, reused); + } + } else { + vm_page_stats_reusable.partial_reuse_calls++; + vm_page_queue_iterate(&object->memq, m, vmp_listq) { + if (object->reusable_page_count == 0) { + break; + } + if (m->vmp_offset < start_offset || + m->vmp_offset >= end_offset) { + continue; + } + VM_OBJECT_REUSE_PAGE(object, m, reused); + } + } + + /* update global stats */ + OSAddAtomic(reusable - reused, &vm_page_stats_reusable.reusable_count); + vm_page_stats_reusable.reused += reused; + vm_page_stats_reusable.reusable += reusable; } /* @@ -1530,7 +2630,7 @@ vm_object_deactivate_pages( * remove access to all pages in shadowed objects. 
* * The object must *not* be locked. The object must - * be temporary/internal. + * be internal. * * If pmap is not NULL, this routine assumes that * the only mappings for the pages are in that @@ -1539,123 +2639,196 @@ vm_object_deactivate_pages( __private_extern__ void vm_object_pmap_protect( - register vm_object_t object, - register vm_object_offset_t offset, - vm_object_size_t size, - pmap_t pmap, - vm_map_offset_t pmap_start, - vm_prot_t prot) + vm_object_t object, + vm_object_offset_t offset, + vm_object_size_t size, + pmap_t pmap, + vm_map_offset_t pmap_start, + vm_prot_t prot) +{ + vm_object_pmap_protect_options(object, offset, size, + pmap, pmap_start, prot, 0); +} + +__private_extern__ void +vm_object_pmap_protect_options( + vm_object_t object, + vm_object_offset_t offset, + vm_object_size_t size, + pmap_t pmap, + vm_map_offset_t pmap_start, + vm_prot_t prot, + int options) { - if (object == VM_OBJECT_NULL) - return; + pmap_flush_context pmap_flush_context_storage; + boolean_t delayed_pmap_flush = FALSE; + + if (object == VM_OBJECT_NULL) { + return; + } size = vm_object_round_page(size); offset = vm_object_trunc_page(offset); vm_object_lock(object); + if (object->phys_contiguous) { + if (pmap != NULL) { + vm_object_unlock(object); + pmap_protect_options(pmap, + pmap_start, + pmap_start + size, + prot, + options & ~PMAP_OPTIONS_NOFLUSH, + NULL); + } else { + vm_object_offset_t phys_start, phys_end, phys_addr; + + phys_start = object->vo_shadow_offset + offset; + phys_end = phys_start + size; + assert(phys_start <= phys_end); + assert(phys_end <= object->vo_shadow_offset + object->vo_size); + vm_object_unlock(object); + + pmap_flush_context_init(&pmap_flush_context_storage); + delayed_pmap_flush = FALSE; + + for (phys_addr = phys_start; + phys_addr < phys_end; + phys_addr += PAGE_SIZE_64) { + pmap_page_protect_options( + (ppnum_t) (phys_addr >> PAGE_SHIFT), + prot, + options | PMAP_OPTIONS_NOFLUSH, + (void *)&pmap_flush_context_storage); + delayed_pmap_flush = TRUE; + } + if (delayed_pmap_flush == TRUE) { + pmap_flush(&pmap_flush_context_storage); + } + } + return; + } + assert(object->internal); while (TRUE) { - if (ptoa_64(object->resident_page_count) > size/2 && pmap != PMAP_NULL) { - vm_object_unlock(object); - pmap_protect(pmap, pmap_start, pmap_start + size, prot); - return; - } - - /* if we are doing large ranges with respect to resident */ - /* page count then we should interate over pages otherwise */ - /* inverse page look-up will be faster */ - if (ptoa_64(object->resident_page_count / 4) < size) { - vm_page_t p; - vm_object_offset_t end; - - end = offset + size; - - if (pmap != PMAP_NULL) { - queue_iterate(&object->memq, p, vm_page_t, listq) { - if (!p->fictitious && - (offset <= p->offset) && (p->offset < end)) { - vm_map_offset_t start; - - start = pmap_start + p->offset - offset; - pmap_protect(pmap, start, start + PAGE_SIZE_64, prot); - } - } - } else { - queue_iterate(&object->memq, p, vm_page_t, listq) { - if (!p->fictitious && - (offset <= p->offset) && (p->offset < end)) { - - pmap_page_protect(p->phys_page, - prot & ~p->page_lock); - } - } - } - } else { - vm_page_t p; - vm_object_offset_t end; - vm_object_offset_t target_off; - - end = offset + size; - - if (pmap != PMAP_NULL) { - for(target_off = offset; - target_off < end; - target_off += PAGE_SIZE) { - p = vm_page_lookup(object, target_off); - if (p != VM_PAGE_NULL) { - vm_offset_t start; - start = pmap_start + - (vm_offset_t)(p->offset - offset); - pmap_protect(pmap, start, - start + PAGE_SIZE, prot); + if 
(ptoa_64(object->resident_page_count) > size / 2 && pmap != PMAP_NULL) { + vm_object_unlock(object); + pmap_protect_options(pmap, pmap_start, pmap_start + size, prot, + options & ~PMAP_OPTIONS_NOFLUSH, NULL); + return; + } + + pmap_flush_context_init(&pmap_flush_context_storage); + delayed_pmap_flush = FALSE; + + /* + * if we are doing large ranges with respect to resident + * page count then we should interate over pages otherwise + * inverse page look-up will be faster + */ + if (ptoa_64(object->resident_page_count / 4) < size) { + vm_page_t p; + vm_object_offset_t end; + + end = offset + size; + + vm_page_queue_iterate(&object->memq, p, vmp_listq) { + if (!p->vmp_fictitious && (offset <= p->vmp_offset) && (p->vmp_offset < end)) { + vm_map_offset_t start; + + start = pmap_start + p->vmp_offset - offset; + + if (pmap != PMAP_NULL) { + pmap_protect_options( + pmap, + start, + start + PAGE_SIZE_64, + prot, + options | PMAP_OPTIONS_NOFLUSH, + &pmap_flush_context_storage); + } else { + pmap_page_protect_options( + VM_PAGE_GET_PHYS_PAGE(p), + prot, + options | PMAP_OPTIONS_NOFLUSH, + &pmap_flush_context_storage); + } + delayed_pmap_flush = TRUE; } - } + } } else { - for(target_off = offset; - target_off < end; target_off += PAGE_SIZE) { + vm_page_t p; + vm_object_offset_t end; + vm_object_offset_t target_off; + + end = offset + size; + + for (target_off = offset; + target_off < end; target_off += PAGE_SIZE) { p = vm_page_lookup(object, target_off); + if (p != VM_PAGE_NULL) { - pmap_page_protect(p->phys_page, - prot & ~p->page_lock); + vm_object_offset_t start; + + start = pmap_start + (p->vmp_offset - offset); + + if (pmap != PMAP_NULL) { + pmap_protect_options( + pmap, + start, + start + PAGE_SIZE_64, + prot, + options | PMAP_OPTIONS_NOFLUSH, + &pmap_flush_context_storage); + } else { + pmap_page_protect_options( + VM_PAGE_GET_PHYS_PAGE(p), + prot, + options | PMAP_OPTIONS_NOFLUSH, + &pmap_flush_context_storage); + } + delayed_pmap_flush = TRUE; } - } + } + } + if (delayed_pmap_flush == TRUE) { + pmap_flush(&pmap_flush_context_storage); } - } - if (prot == VM_PROT_NONE) { - /* - * Must follow shadow chain to remove access - * to pages in shadowed objects. - */ - register vm_object_t next_object; - - next_object = object->shadow; - if (next_object != VM_OBJECT_NULL) { - offset += object->shadow_offset; - vm_object_lock(next_object); - vm_object_unlock(object); - object = next_object; - } - else { - /* - * End of chain - we are done. - */ - break; - } - } - else { - /* - * Pages in shadowed objects may never have - * write permission - we may stop here. - */ - break; - } + if (prot == VM_PROT_NONE) { + /* + * Must follow shadow chain to remove access + * to pages in shadowed objects. + */ + vm_object_t next_object; + + next_object = object->shadow; + if (next_object != VM_OBJECT_NULL) { + offset += object->vo_shadow_offset; + vm_object_lock(next_object); + vm_object_unlock(object); + object = next_object; + } else { + /* + * End of chain - we are done. + */ + break; + } + } else { + /* + * Pages in shadowed objects may never have + * write permission - we may stop here. 
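 *
 * Usage sketch (illustrative values, not from this change): stripping
 * write access from a 4-page run that is mapped by only one pmap:
 *
 *      vm_object_pmap_protect_options(object,
 *          offset,                 // start offset within the object
 *          4 * PAGE_SIZE_64,       // size of the run
 *          map_pmap,               // the only pmap mapping these pages,
 *                                  // or PMAP_NULL to hit every mapping
 *          start,                  // where the run begins in that pmap
 *          VM_PROT_READ,           // i.e. revoke VM_PROT_WRITE
 *          0);                     // no extra PMAP_OPTIONS_* flags
 *
 * Because prot != VM_PROT_NONE in this example, the loop above stops at
 * the top object; with VM_PROT_NONE it keeps following object->shadow,
 * since pages in backing objects may still be readable through the
 * mapping even though they are never writable through it.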
+ */ + break; + } } vm_object_unlock(object); } +uint32_t vm_page_busy_absent_skipped = 0; + /* * Routine: vm_object_copy_slowly * @@ -1688,34 +2861,28 @@ vm_object_pmap_protect( */ __private_extern__ kern_return_t vm_object_copy_slowly( - register vm_object_t src_object, - vm_object_offset_t src_offset, - vm_object_size_t size, - boolean_t interruptible, - vm_object_t *_result_object) /* OUT */ + vm_object_t src_object, + vm_object_offset_t src_offset, + vm_object_size_t size, + boolean_t interruptible, + vm_object_t *_result_object) /* OUT */ { - vm_object_t new_object; - vm_object_offset_t new_offset; + vm_object_t new_object; + vm_object_offset_t new_offset; - vm_object_offset_t src_lo_offset = src_offset; - vm_object_offset_t src_hi_offset = src_offset + size; - - XPR(XPR_VM_OBJECT, "v_o_c_slowly obj 0x%x off 0x%x size 0x%x\n", - src_object, src_offset, size, 0, 0); + struct vm_object_fault_info fault_info = {}; if (size == 0) { vm_object_unlock(src_object); *_result_object = VM_OBJECT_NULL; - return(KERN_INVALID_ARGUMENT); + return KERN_INVALID_ARGUMENT; } /* * Prevent destruction of the source object while we copy. */ - assert(src_object->ref_count > 0); - src_object->ref_count++; - VM_OBJ_RES_INCR(src_object); + vm_object_reference_locked(src_object); vm_object_unlock(src_object); /* @@ -1729,135 +2896,223 @@ vm_object_copy_slowly( new_object = vm_object_allocate(size); new_offset = 0; - vm_object_lock(new_object); - assert(size == trunc_page_64(size)); /* Will the loop terminate? */ + assert(size == trunc_page_64(size)); /* Will the loop terminate? */ - for ( ; - size != 0 ; - src_offset += PAGE_SIZE_64, - new_offset += PAGE_SIZE_64, size -= PAGE_SIZE_64 + fault_info.interruptible = interruptible; + fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL; + fault_info.lo_offset = src_offset; + fault_info.hi_offset = src_offset + size; + fault_info.stealth = TRUE; + + for (; + size != 0; + src_offset += PAGE_SIZE_64, + new_offset += PAGE_SIZE_64, size -= PAGE_SIZE_64 ) { - vm_page_t new_page; + vm_page_t new_page; vm_fault_return_t result; + vm_object_lock(new_object); + while ((new_page = vm_page_alloc(new_object, new_offset)) - == VM_PAGE_NULL) { + == VM_PAGE_NULL) { + vm_object_unlock(new_object); + if (!vm_page_wait(interruptible)) { - vm_object_unlock(new_object); vm_object_deallocate(new_object); vm_object_deallocate(src_object); *_result_object = VM_OBJECT_NULL; - return(MACH_SEND_INTERRUPTED); + return MACH_SEND_INTERRUPTED; } + vm_object_lock(new_object); } + vm_object_unlock(new_object); do { - vm_prot_t prot = VM_PROT_READ; - vm_page_t _result_page; - vm_page_t top_page; - register - vm_page_t result_page; - kern_return_t error_code; + vm_prot_t prot = VM_PROT_READ; + vm_page_t _result_page; + vm_page_t top_page; + vm_page_t result_page; + kern_return_t error_code; + vm_object_t result_page_object; + vm_object_lock(src_object); + + if (src_object->internal && + src_object->shadow == VM_OBJECT_NULL && + (src_object->pager == NULL || + (VM_COMPRESSOR_PAGER_STATE_GET(src_object, + src_offset) == + VM_EXTERNAL_STATE_ABSENT))) { + boolean_t can_skip_page; + + _result_page = vm_page_lookup(src_object, + src_offset); + if (_result_page == VM_PAGE_NULL) { + /* + * This page is neither resident nor + * compressed and there's no shadow + * object below "src_object", so this + * page is really missing. + * There's no need to zero-fill it just + * to copy it: let's leave it missing + * in "new_object" and get zero-filled + * on demand. 
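 *
 * Summary of the skip decision made in this block (restating the two
 * cases handled by the code around it): the copy is skipped when the
 * source object is internal, has no shadow, and either
 *
 *      (a) the page is neither resident nor backed by the pager or
 *          compressor  ->  it is genuinely missing, so the copy can
 *          stay missing and be zero-filled on first touch, or
 *      (b) (workaround_41447923) the page is a busy+absent
 *          "no zero fill" placeholder awaiting a device-driver
 *          overwrite  ->  its initial contents would have been zeros
 *          anyway, so the copy can also stay missing.
 *
 * In both cases the pre-allocated "new_page" is freed and the loop moves
 * on to the next source offset.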
+ */ + can_skip_page = TRUE; + } else if (workaround_41447923 && + src_object->pager == NULL && + _result_page != VM_PAGE_NULL && + _result_page->vmp_busy && + _result_page->vmp_absent && + src_object->purgable == VM_PURGABLE_DENY && + !src_object->blocked_access) { + /* + * This page is "busy" and "absent" + * but not because we're waiting for + * it to be decompressed. It must + * be because it's a "no zero fill" + * page that is currently not + * accessible until it gets overwritten + * by a device driver. + * Since its initial state would have + * been "zero-filled", let's leave the + * copy page missing and get zero-filled + * on demand. + */ + assert(src_object->internal); + assert(src_object->shadow == NULL); + assert(src_object->pager == NULL); + can_skip_page = TRUE; + vm_page_busy_absent_skipped++; + } else { + can_skip_page = FALSE; + } + if (can_skip_page) { + vm_object_unlock(src_object); + /* free the unused "new_page"... */ + vm_object_lock(new_object); + VM_PAGE_FREE(new_page); + new_page = VM_PAGE_NULL; + vm_object_unlock(new_object); + /* ...and go to next page in "src_object" */ + result = VM_FAULT_SUCCESS; + break; + } + } + vm_object_paging_begin(src_object); - XPR(XPR_VM_FAULT,"vm_object_copy_slowly -> vm_fault_page",0,0,0,0,0); + /* cap size at maximum UPL size */ + upl_size_t cluster_size; + if (os_convert_overflow(size, &cluster_size)) { + cluster_size = 0 - (upl_size_t)PAGE_SIZE; + } + fault_info.cluster_size = cluster_size; + + _result_page = VM_PAGE_NULL; result = vm_fault_page(src_object, src_offset, - VM_PROT_READ, FALSE, interruptible, - src_lo_offset, src_hi_offset, - VM_BEHAVIOR_SEQUENTIAL, - &prot, &_result_page, &top_page, - (int *)0, - &error_code, FALSE, FALSE, NULL, 0); + VM_PROT_READ, FALSE, + FALSE, /* page not looked up */ + &prot, &_result_page, &top_page, + (int *)0, + &error_code, FALSE, FALSE, &fault_info); - switch(result) { - case VM_FAULT_SUCCESS: - result_page = _result_page; + switch (result) { + case VM_FAULT_SUCCESS: + result_page = _result_page; + result_page_object = VM_PAGE_OBJECT(result_page); - /* - * We don't need to hold the object - * lock -- the busy page will be enough. - * [We don't care about picking up any - * new modifications.] - * - * Copy the page to the new object. - * - * POLICY DECISION: - * If result_page is clean, - * we could steal it instead - * of copying. - */ + /* + * Copy the page to the new object. + * + * POLICY DECISION: + * If result_page is clean, + * we could steal it instead + * of copying. + */ - vm_object_unlock(result_page->object); - vm_page_copy(result_page, new_page); + vm_page_copy(result_page, new_page); + vm_object_unlock(result_page_object); - /* - * Let go of both pages (make them - * not busy, perform wakeup, activate). - */ + /* + * Let go of both pages (make them + * not busy, perform wakeup, activate). + */ + vm_object_lock(new_object); + SET_PAGE_DIRTY(new_page, FALSE); + PAGE_WAKEUP_DONE(new_page); + vm_object_unlock(new_object); + + vm_object_lock(result_page_object); + PAGE_WAKEUP_DONE(result_page); + + vm_page_lockspin_queues(); + if ((result_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) || + (result_page->vmp_q_state == VM_PAGE_NOT_ON_Q)) { + vm_page_activate(result_page); + } + vm_page_activate(new_page); + vm_page_unlock_queues(); - new_page->busy = FALSE; - new_page->dirty = TRUE; - vm_object_lock(result_page->object); - PAGE_WAKEUP_DONE(result_page); + /* + * Release paging references and + * top-level placeholder page, if any. 
+ */ - vm_page_lock_queues(); - if (!result_page->active && - !result_page->inactive) - vm_page_activate(result_page); - vm_page_activate(new_page); - vm_page_unlock_queues(); + vm_fault_cleanup(result_page_object, + top_page); - /* - * Release paging references and - * top-level placeholder page, if any. - */ + break; - vm_fault_cleanup(result_page->object, - top_page); + case VM_FAULT_RETRY: + break; + case VM_FAULT_MEMORY_SHORTAGE: + if (vm_page_wait(interruptible)) { break; - - case VM_FAULT_RETRY: - break; + } + /* fall thru */ - case VM_FAULT_FICTITIOUS_SHORTAGE: - vm_page_more_fictitious(); - break; + case VM_FAULT_INTERRUPTED: + vm_object_lock(new_object); + VM_PAGE_FREE(new_page); + vm_object_unlock(new_object); - case VM_FAULT_MEMORY_SHORTAGE: - if (vm_page_wait(interruptible)) - break; - /* fall thru */ + vm_object_deallocate(new_object); + vm_object_deallocate(src_object); + *_result_object = VM_OBJECT_NULL; + return MACH_SEND_INTERRUPTED; - case VM_FAULT_INTERRUPTED: - vm_page_free(new_page); - vm_object_unlock(new_object); - vm_object_deallocate(new_object); - vm_object_deallocate(src_object); - *_result_object = VM_OBJECT_NULL; - return(MACH_SEND_INTERRUPTED); + case VM_FAULT_SUCCESS_NO_VM_PAGE: + /* success but no VM page: fail */ + vm_object_paging_end(src_object); + vm_object_unlock(src_object); + /*FALLTHROUGH*/ + case VM_FAULT_MEMORY_ERROR: + /* + * A policy choice: + * (a) ignore pages that we can't + * copy + * (b) return the null object if + * any page fails [chosen] + */ - case VM_FAULT_MEMORY_ERROR: - /* - * A policy choice: - * (a) ignore pages that we can't - * copy - * (b) return the null object if - * any page fails [chosen] - */ + vm_object_lock(new_object); + VM_PAGE_FREE(new_page); + vm_object_unlock(new_object); - vm_page_lock_queues(); - vm_page_free(new_page); - vm_page_unlock_queues(); - vm_object_unlock(new_object); - vm_object_deallocate(new_object); - vm_object_deallocate(src_object); - *_result_object = VM_OBJECT_NULL; - return(error_code ? error_code: - KERN_MEMORY_ERROR); + vm_object_deallocate(new_object); + vm_object_deallocate(src_object); + *_result_object = VM_OBJECT_NULL; + return error_code ? error_code: + KERN_MEMORY_ERROR; + + default: + panic("vm_object_copy_slowly: unexpected error" + " 0x%x from vm_fault_page()\n", result); } } while (result != VM_FAULT_SUCCESS); } @@ -1865,11 +3120,9 @@ vm_object_copy_slowly( /* * Lose the extra reference, and return our object. 
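 *
 * Caller-side sketch (hypothetical, error handling trimmed).  The source
 * object is handed in locked; the routine takes and later drops its own
 * reference on it, and always unlocks it:
 *
 *      vm_object_t new_object;
 *      kern_return_t kr;
 *
 *      vm_object_lock(src_object);
 *      kr = vm_object_copy_slowly(src_object, src_offset, size,
 *              FALSE,          // i.e. THREAD_UNINT: not interruptible
 *              &new_object);
 *      if (kr == KERN_SUCCESS) {
 *              // new_object holds private copies of the source pages
 *              // in [src_offset, src_offset + size)
 *      } else {
 *              // new_object == VM_OBJECT_NULL; kr is
 *              // MACH_SEND_INTERRUPTED, KERN_INVALID_ARGUMENT, etc.
 *      }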
*/ - - vm_object_unlock(new_object); vm_object_deallocate(src_object); *_result_object = new_object; - return(KERN_SUCCESS); + return KERN_SUCCESS; } /* @@ -1892,21 +3145,19 @@ vm_object_copy_slowly( /*ARGSUSED*/ __private_extern__ boolean_t vm_object_copy_quickly( - vm_object_t *_object, /* INOUT */ - __unused vm_object_offset_t offset, /* IN */ - __unused vm_object_size_t size, /* IN */ - boolean_t *_src_needs_copy, /* OUT */ - boolean_t *_dst_needs_copy) /* OUT */ + vm_object_t *_object, /* INOUT */ + __unused vm_object_offset_t offset, /* IN */ + __unused vm_object_size_t size, /* IN */ + boolean_t *_src_needs_copy, /* OUT */ + boolean_t *_dst_needs_copy) /* OUT */ { - vm_object_t object = *_object; + vm_object_t object = *_object; memory_object_copy_strategy_t copy_strategy; - XPR(XPR_VM_OBJECT, "v_o_c_quickly obj 0x%x off 0x%x size 0x%x\n", - *_object, offset, size, 0, 0); if (object == VM_OBJECT_NULL) { *_src_needs_copy = FALSE; *_dst_needs_copy = FALSE; - return(TRUE); + return TRUE; } vm_object_lock(object); @@ -1922,9 +3173,7 @@ vm_object_copy_quickly( * Leave object/offset unchanged. */ - assert(object->ref_count > 0); - object->ref_count++; - vm_object_res_reference(object); + vm_object_reference_locked(object); object->shadowed = TRUE; vm_object_unlock(object); @@ -1941,13 +3190,13 @@ vm_object_copy_quickly( case MEMORY_OBJECT_COPY_DELAY: vm_object_unlock(object); - return(FALSE); + return FALSE; default: vm_object_unlock(object); - return(FALSE); + return FALSE; } - return(TRUE); + return TRUE; } static int copy_call_count = 0; @@ -1974,14 +3223,15 @@ static int copy_call_restart_count = 0; */ static kern_return_t vm_object_copy_call( - vm_object_t src_object, - vm_object_offset_t src_offset, - vm_object_size_t size, - vm_object_t *_result_object) /* OUT */ + vm_object_t src_object, + vm_object_offset_t src_offset, + vm_object_size_t size, + vm_object_t *_result_object) /* OUT */ { - kern_return_t kr; - vm_object_t copy; - boolean_t check_ready = FALSE; + kern_return_t kr; + vm_object_t copy; + boolean_t check_ready = FALSE; + uint32_t try_failed_count = 0; /* * If a copy is already in progress, wait and retry. @@ -1999,7 +3249,7 @@ vm_object_copy_call( copy_call_count++; while (vm_object_wanted(src_object, VM_OBJECT_EVENT_COPY_CALL)) { vm_object_sleep(src_object, VM_OBJECT_EVENT_COPY_CALL, - THREAD_UNINT); + THREAD_UNINT); copy_call_restart_count++; } @@ -2022,7 +3272,7 @@ vm_object_copy_call( * via memory_object_create_copy. */ - kr = KERN_FAILURE; /* XXX need to change memory_object.defs */ + kr = KERN_FAILURE; /* XXX need to change memory_object.defs */ if (kr != KERN_SUCCESS) { return kr; } @@ -2033,7 +3283,7 @@ vm_object_copy_call( vm_object_lock(src_object); while (vm_object_wanted(src_object, VM_OBJECT_EVENT_COPY_CALL)) { vm_object_sleep(src_object, VM_OBJECT_EVENT_COPY_CALL, - THREAD_UNINT); + THREAD_UNINT); copy_call_sleep_count++; } Retry: @@ -2041,15 +3291,20 @@ Retry: copy = src_object->copy; if (!vm_object_lock_try(copy)) { vm_object_unlock(src_object); - mutex_pause(); /* wait a bit */ + + try_failed_count++; + mutex_pause(try_failed_count); /* wait a bit */ + vm_object_lock(src_object); goto Retry; } - if (copy->size < src_offset+size) - copy->size = src_offset+size; + if (copy->vo_size < src_offset + size) { + copy->vo_size = src_offset + size; + } - if (!copy->pager_ready) + if (!copy->pager_ready) { check_ready = TRUE; + } /* * Return the copy. 
@@ -2091,20 +3346,24 @@ static int copy_delayed_protect_iterate = 0; */ __private_extern__ vm_object_t vm_object_copy_delayed( - vm_object_t src_object, - vm_object_offset_t src_offset, - vm_object_size_t size) + vm_object_t src_object, + vm_object_offset_t src_offset, + vm_object_size_t size, + boolean_t src_object_shared) { - vm_object_t new_copy = VM_OBJECT_NULL; - vm_object_t old_copy; - vm_page_t p; - vm_object_size_t copy_size = src_offset + size; + vm_object_t new_copy = VM_OBJECT_NULL; + vm_object_t old_copy; + vm_page_t p; + vm_object_size_t copy_size = src_offset + size; + pmap_flush_context pmap_flush_context_storage; + boolean_t delayed_pmap_flush = FALSE; + int collisions = 0; /* * The user-level memory manager wants to see all of the changes * to this object, but it has promised not to make any changes on - * its own. + * its own. * * Perform an asymmetric copy-on-write, as follows: * Create a new object, called a "copy object" to hold @@ -2140,14 +3399,23 @@ vm_object_copy_delayed( * the original object must be done carefully, to avoid deadlock. */ - Retry: - + copy_size = vm_object_round_page(copy_size); +Retry: + /* * Wait for paging in progress. */ - if (!src_object->true_share) + if (!src_object->true_share && + (src_object->paging_in_progress != 0 || + src_object->activity_in_progress != 0)) { + if (src_object_shared == TRUE) { + vm_object_unlock(src_object); + vm_object_lock(src_object); + src_object_shared = FALSE; + goto Retry; + } vm_object_paging_wait(src_object, THREAD_UNINT); - + } /* * See whether we can reuse the result of a previous * copy operation. @@ -2155,22 +3423,38 @@ vm_object_copy_delayed( old_copy = src_object->copy; if (old_copy != VM_OBJECT_NULL) { + int lock_granted; + /* * Try to get the locks (out of order) */ - if (!vm_object_lock_try(old_copy)) { + if (src_object_shared == TRUE) { + lock_granted = vm_object_lock_try_shared(old_copy); + } else { + lock_granted = vm_object_lock_try(old_copy); + } + + if (!lock_granted) { vm_object_unlock(src_object); - mutex_pause(); + + if (collisions++ == 0) { + copy_delayed_lock_contention++; + } + mutex_pause(collisions); /* Heisenberg Rules */ copy_delayed_lock_collisions++; - if (collisions++ == 0) - copy_delayed_lock_contention++; - if (collisions > copy_delayed_max_collisions) + if (collisions > copy_delayed_max_collisions) { copy_delayed_max_collisions = collisions; + } + + if (src_object_shared == TRUE) { + vm_object_lock_shared(src_object); + } else { + vm_object_lock(src_object); + } - vm_object_lock(src_object); goto Retry; } @@ -2190,7 +3474,15 @@ vm_object_copy_delayed( * needed). */ - if (old_copy->size < copy_size) { + if (old_copy->vo_size < copy_size) { + if (src_object_shared == TRUE) { + vm_object_unlock(old_copy); + vm_object_unlock(src_object); + + vm_object_lock(src_object); + src_object_shared = FALSE; + goto Retry; + } /* * We can't perform a delayed copy if any of the * pages in the extended range are wired (because @@ -2199,11 +3491,15 @@ vm_object_copy_delayed( * go ahead and protect them. 
*/ copy_delayed_protect_iterate++; - queue_iterate(&src_object->memq, p, vm_page_t, listq) { - if (!p->fictitious && - p->offset >= old_copy->size && - p->offset < copy_size) { - if (p->wire_count > 0) { + + pmap_flush_context_init(&pmap_flush_context_storage); + delayed_pmap_flush = FALSE; + + vm_page_queue_iterate(&src_object->memq, p, vmp_listq) { + if (!p->vmp_fictitious && + p->vmp_offset >= old_copy->vo_size && + p->vmp_offset < copy_size) { + if (VM_PAGE_WIRED(p)) { vm_object_unlock(old_copy); vm_object_unlock(src_object); @@ -2211,19 +3507,29 @@ vm_object_copy_delayed( vm_object_unlock(new_copy); vm_object_deallocate(new_copy); } + if (delayed_pmap_flush == TRUE) { + pmap_flush(&pmap_flush_context_storage); + } return VM_OBJECT_NULL; } else { - pmap_page_protect(p->phys_page, - (VM_PROT_ALL & ~VM_PROT_WRITE & - ~p->page_lock)); + pmap_page_protect_options(VM_PAGE_GET_PHYS_PAGE(p), (VM_PROT_ALL & ~VM_PROT_WRITE), + PMAP_OPTIONS_NOFLUSH, (void *)&pmap_flush_context_storage); + delayed_pmap_flush = TRUE; } } } - old_copy->size = copy_size; + if (delayed_pmap_flush == TRUE) { + pmap_flush(&pmap_flush_context_storage); + } + + old_copy->vo_size = copy_size; + } + if (src_object_shared == TRUE) { + vm_object_reference_shared(old_copy); + } else { + vm_object_reference_locked(old_copy); } - - vm_object_reference_locked(old_copy); vm_object_unlock(old_copy); vm_object_unlock(src_object); @@ -2231,17 +3537,19 @@ vm_object_copy_delayed( vm_object_unlock(new_copy); vm_object_deallocate(new_copy); } - - return(old_copy); + return old_copy; } + + /* - * Adjust the size argument so that the newly-created + * Adjust the size argument so that the newly-created * copy object will be large enough to back either the * old copy object or the new mapping. */ - if (old_copy->size > copy_size) - copy_size = old_copy->size; + if (old_copy->vo_size > copy_size) { + copy_size = old_copy->vo_size; + } if (new_copy == VM_OBJECT_NULL) { vm_object_unlock(old_copy); @@ -2249,9 +3557,11 @@ vm_object_copy_delayed( new_copy = vm_object_allocate(copy_size); vm_object_lock(src_object); vm_object_lock(new_copy); + + src_object_shared = FALSE; goto Retry; } - new_copy->size = copy_size; + new_copy->vo_size = copy_size; /* * The copy-object is always made large enough to @@ -2261,13 +3571,14 @@ vm_object_copy_delayed( */ assert((old_copy->shadow == src_object) && - (old_copy->shadow_offset == (vm_object_offset_t) 0)); - + (old_copy->vo_shadow_offset == (vm_object_offset_t) 0)); } else if (new_copy == VM_OBJECT_NULL) { vm_object_unlock(src_object); new_copy = vm_object_allocate(copy_size); vm_object_lock(src_object); vm_object_lock(new_copy); + + src_object_shared = FALSE; goto Retry; } @@ -2283,22 +3594,35 @@ vm_object_copy_delayed( * wired, then go ahead and protect them. 
*/ copy_delayed_protect_iterate++; - queue_iterate(&src_object->memq, p, vm_page_t, listq) { - if (!p->fictitious && p->offset < copy_size) { - if (p->wire_count > 0) { - if (old_copy) + + pmap_flush_context_init(&pmap_flush_context_storage); + delayed_pmap_flush = FALSE; + + vm_page_queue_iterate(&src_object->memq, p, vmp_listq) { + if (!p->vmp_fictitious && p->vmp_offset < copy_size) { + if (VM_PAGE_WIRED(p)) { + if (old_copy) { vm_object_unlock(old_copy); + } vm_object_unlock(src_object); vm_object_unlock(new_copy); vm_object_deallocate(new_copy); + + if (delayed_pmap_flush == TRUE) { + pmap_flush(&pmap_flush_context_storage); + } + return VM_OBJECT_NULL; } else { - pmap_page_protect(p->phys_page, - (VM_PROT_ALL & ~VM_PROT_WRITE & - ~p->page_lock)); + pmap_page_protect_options(VM_PAGE_GET_PHYS_PAGE(p), (VM_PROT_ALL & ~VM_PROT_WRITE), + PMAP_OPTIONS_NOFLUSH, (void *)&pmap_flush_context_storage); + delayed_pmap_flush = TRUE; } } } + if (delayed_pmap_flush == TRUE) { + pmap_flush(&pmap_flush_context_storage); + } if (old_copy != VM_OBJECT_NULL) { /* @@ -2307,11 +3631,15 @@ vm_object_copy_delayed( * object. */ - src_object->ref_count--; /* remove ref. from old_copy */ + /* remove ref. from old_copy */ + vm_object_lock_assert_exclusive(src_object); + src_object->ref_count--; assert(src_object->ref_count > 0); + vm_object_lock_assert_exclusive(old_copy); old_copy->shadow = new_copy; + vm_object_lock_assert_exclusive(new_copy); assert(new_copy->ref_count > 0); - new_copy->ref_count++; /* for old_copy->shadow ref. */ + new_copy->ref_count++; /* for old_copy->shadow ref. */ #if TASK_SWAPPER if (old_copy->res_count) { @@ -2320,27 +3648,24 @@ vm_object_copy_delayed( } #endif - vm_object_unlock(old_copy); /* done with old_copy */ + vm_object_unlock(old_copy); /* done with old_copy */ } /* * Point the new copy at the existing object. 
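 *
 * After the assignments below, the delayed-copy linkage is, roughly:
 *
 *	new_copy->shadow            == src_object
 *	new_copy->vo_shadow_offset  == 0
 *	src_object->copy            == new_copy
 *	old_copy->shadow            == new_copy    (when an old copy existed)
 *
 * so faults against new_copy fall through to src_object until a page
 * is pushed to the copy, and src_object remembers new_copy as its most
 * recent copy object.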
*/ + vm_object_lock_assert_exclusive(new_copy); new_copy->shadow = src_object; - new_copy->shadow_offset = 0; - new_copy->shadowed = TRUE; /* caller must set needs_copy */ - assert(src_object->ref_count > 0); - src_object->ref_count++; - VM_OBJ_RES_INCR(src_object); + new_copy->vo_shadow_offset = 0; + new_copy->shadowed = TRUE; /* caller must set needs_copy */ + + vm_object_lock_assert_exclusive(src_object); + vm_object_reference_locked(src_object); src_object->copy = new_copy; vm_object_unlock(src_object); vm_object_unlock(new_copy); - XPR(XPR_VM_OBJECT, - "vm_object_copy_delayed: used copy object %X for source %X\n", - (integer_t)new_copy, (integer_t)src_object, 0, 0, 0); - - return(new_copy); + return new_copy; } /* @@ -2353,20 +3678,28 @@ vm_object_copy_delayed( */ __private_extern__ kern_return_t vm_object_copy_strategically( - register vm_object_t src_object, - vm_object_offset_t src_offset, - vm_object_size_t size, - vm_object_t *dst_object, /* OUT */ - vm_object_offset_t *dst_offset, /* OUT */ - boolean_t *dst_needs_copy) /* OUT */ + vm_object_t src_object, + vm_object_offset_t src_offset, + vm_object_size_t size, + vm_object_t *dst_object, /* OUT */ + vm_object_offset_t *dst_offset, /* OUT */ + boolean_t *dst_needs_copy) /* OUT */ { - boolean_t result; - boolean_t interruptible = THREAD_ABORTSAFE; /* XXX */ + boolean_t result; + boolean_t interruptible = THREAD_ABORTSAFE; /* XXX */ + boolean_t object_lock_shared = FALSE; memory_object_copy_strategy_t copy_strategy; assert(src_object != VM_OBJECT_NULL); - vm_object_lock(src_object); + copy_strategy = src_object->copy_strategy; + + if (copy_strategy == MEMORY_OBJECT_COPY_DELAY) { + vm_object_lock_shared(src_object); + object_lock_shared = TRUE; + } else { + vm_object_lock(src_object); + } /* * The copy strategy is only valid if the memory manager @@ -2376,28 +3709,32 @@ vm_object_copy_strategically( while (!src_object->internal && !src_object->pager_ready) { wait_result_t wait_result; - wait_result = vm_object_sleep( src_object, - VM_OBJECT_EVENT_PAGER_READY, - interruptible); + if (object_lock_shared == TRUE) { + vm_object_unlock(src_object); + vm_object_lock(src_object); + object_lock_shared = FALSE; + continue; + } + wait_result = vm_object_sleep( src_object, + VM_OBJECT_EVENT_PAGER_READY, + interruptible); if (wait_result != THREAD_AWAKENED) { vm_object_unlock(src_object); *dst_object = VM_OBJECT_NULL; *dst_offset = 0; *dst_needs_copy = FALSE; - return(MACH_SEND_INTERRUPTED); + return MACH_SEND_INTERRUPTED; } } - copy_strategy = src_object->copy_strategy; - /* * Use the appropriate copy strategy. 
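 *
 * Roughly, the dispatch below behaves as follows (OUT values shown
 * for the success case):
 *
 *	COPY_DELAY     -> vm_object_copy_delayed();  *dst_needs_copy = TRUE
 *	               (falls through to COPY_NONE if the delayed copy fails)
 *	COPY_NONE      -> vm_object_copy_slowly();   *dst_needs_copy = FALSE
 *	COPY_CALL      -> vm_object_copy_call();     *dst_needs_copy = TRUE
 *	COPY_SYMMETRIC -> no copy here; returns KERN_MEMORY_RESTART_COPY
 *	               so the caller can redo the copy symmetrically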
*/ switch (copy_strategy) { - case MEMORY_OBJECT_COPY_DELAY: + case MEMORY_OBJECT_COPY_DELAY: *dst_object = vm_object_copy_delayed(src_object, - src_offset, size); + src_offset, size, object_lock_shared); if (*dst_object != VM_OBJECT_NULL) { *dst_offset = src_offset; *dst_needs_copy = TRUE; @@ -2405,37 +3742,36 @@ vm_object_copy_strategically( break; } vm_object_lock(src_object); - /* fall thru when delayed copy not allowed */ + /* fall thru when delayed copy not allowed */ - case MEMORY_OBJECT_COPY_NONE: + case MEMORY_OBJECT_COPY_NONE: result = vm_object_copy_slowly(src_object, src_offset, size, - interruptible, dst_object); + interruptible, dst_object); if (result == KERN_SUCCESS) { *dst_offset = 0; *dst_needs_copy = FALSE; } break; - case MEMORY_OBJECT_COPY_CALL: + case MEMORY_OBJECT_COPY_CALL: result = vm_object_copy_call(src_object, src_offset, size, - dst_object); + dst_object); if (result == KERN_SUCCESS) { *dst_offset = src_offset; *dst_needs_copy = TRUE; } break; - case MEMORY_OBJECT_COPY_SYMMETRIC: - XPR(XPR_VM_OBJECT, "v_o_c_strategically obj 0x%x off 0x%x size 0x%x\n",(natural_t)src_object, src_offset, size, 0, 0); + case MEMORY_OBJECT_COPY_SYMMETRIC: vm_object_unlock(src_object); result = KERN_MEMORY_RESTART_COPY; break; - default: + default: panic("copy_strategically: bad strategy"); result = KERN_INVALID_ARGUMENT; } - return(result); + return result; } /* @@ -2448,38 +3784,86 @@ vm_object_copy_strategically( * The new object and offset into that object * are returned in the source parameters. */ -boolean_t vm_object_shadow_check = FALSE; +boolean_t vm_object_shadow_check = TRUE; __private_extern__ boolean_t vm_object_shadow( - vm_object_t *object, /* IN/OUT */ - vm_object_offset_t *offset, /* IN/OUT */ - vm_object_size_t length) + vm_object_t *object, /* IN/OUT */ + vm_object_offset_t *offset, /* IN/OUT */ + vm_object_size_t length) { - register vm_object_t source; - register vm_object_t result; + vm_object_t source; + vm_object_t result; source = *object; + assert(source != VM_OBJECT_NULL); + if (source == VM_OBJECT_NULL) { + return FALSE; + } + +#if 0 + /* + * XXX FBDP + * This assertion is valid but it gets triggered by Rosetta for example + * due to a combination of vm_remap() that changes a VM object's + * copy_strategy from SYMMETRIC to DELAY and vm_protect(VM_PROT_COPY) + * that then sets "needs_copy" on its map entry. This creates a + * mapping situation that VM should never see and doesn't know how to + * handle. + * It's not clear if this can create any real problem but we should + * look into fixing this, probably by having vm_protect(VM_PROT_COPY) + * do more than just set "needs_copy" to handle the copy-on-write... + * In the meantime, let's disable the assertion. + */ assert(source->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC); +#endif /* * Determine if we really need a shadow. + * + * If the source object is larger than what we are trying + * to create, then force the shadow creation even if the + * ref count is 1. This will allow us to [potentially] + * collapse the underlying object away in the future + * (freeing up the extra data it might contain and that + * we don't need). */ - if (vm_object_shadow_check && source->ref_count == 1 && - (source->shadow == VM_OBJECT_NULL || - source->shadow->copy == VM_OBJECT_NULL)) - { - source->shadowed = FALSE; - return FALSE; + assert(source->copy_strategy != MEMORY_OBJECT_COPY_NONE); /* Purgeable objects shouldn't have shadow objects. 
*/ + + if (vm_object_shadow_check && + source->vo_size == length && + source->ref_count == 1) { + /* + * Lock the object and check again. + * We also check to see if there's + * a shadow or copy object involved. + * We can't do that earlier because + * without the object locked, there + * could be a collapse and the chain + * gets modified leaving us with an + * invalid pointer. + */ + vm_object_lock(source); + if (source->vo_size == length && + source->ref_count == 1 && + (source->shadow == VM_OBJECT_NULL || + source->shadow->copy == VM_OBJECT_NULL)) { + source->shadowed = FALSE; + vm_object_unlock(source); + return FALSE; + } + /* things changed while we were locking "source"... */ + vm_object_unlock(source); } /* * Allocate a new object with the given length */ - if ((result = vm_object_allocate(length)) == VM_OBJECT_NULL) + if ((result = vm_object_allocate(length)) == VM_OBJECT_NULL) { panic("vm_object_shadow: no object for shadowing"); + } /* * The new object shadows the source object, adding @@ -2489,13 +3873,13 @@ vm_object_shadow( * count. */ result->shadow = source; - + /* * Store the offset into the source object, * and fix up the offset into the new object. */ - result->shadow_offset = *offset; + result->vo_shadow_offset = *offset; /* * Return the new things @@ -2511,7 +3895,7 @@ vm_object_shadow( * the memory_object requires careful synchronization. * * All associations are created by memory_object_create_named - * for external pagers and vm_object_pager_create for internal + * for external pagers and vm_object_compressor_pager_create for internal * objects as follows: * * pager: the memory_object itself, supplied by @@ -2565,10 +3949,6 @@ vm_object_shadow( * [Furthermore, each routine must cope with the simultaneous * or previous operations of the others.] * - * In addition to the lock on the object, the vm_object_cache_lock - * governs the associations. References gained through the - * association require use of the cache lock. - * * Because the pager field may be cleared spontaneously, it * cannot be used to determine whether a memory object has * ever been associated with a particular vm_object. [This @@ -2585,360 +3965,93 @@ vm_object_shadow( * termination routines and vm_object_collapse.] */ -#if 0 -static void vm_object_abort_activity( - vm_object_t object); /* - * Routine: vm_object_abort_activity [internal use only] + * Routine: vm_object_memory_object_associate * Purpose: - * Abort paging requests pending on this object. - * In/out conditions: - * The object is locked on entry and exit. + * Associate a VM object to the given pager. + * If a VM object is not provided, create one. + * Initialize the pager. */ -static void -vm_object_abort_activity( - vm_object_t object) +vm_object_t +vm_object_memory_object_associate( + memory_object_t pager, + vm_object_t object, + vm_object_size_t size, + boolean_t named) { - register - vm_page_t p; - vm_page_t next; - - XPR(XPR_VM_OBJECT, "vm_object_abort_activity, object 0x%X\n", - (integer_t)object, 0, 0, 0, 0); - - /* - * Abort all activity that would be waiting - * for a result on this memory object. - * - * We could also choose to destroy all pages - * that we have in memory for this object, but - * we don't. - */ - - p = (vm_page_t) queue_first(&object->memq); - while (!queue_end(&object->memq, (queue_entry_t) p)) { - next = (vm_page_t) queue_next(&p->listq); + memory_object_control_t control; - /* - * If it's being paged in, destroy it. - * If an unlock has been requested, start it again. 
- */ + assert(pager != MEMORY_OBJECT_NULL); - if (p->busy && p->absent) { - VM_PAGE_FREE(p); - } - else { - if (p->unlock_request != VM_PROT_NONE) - p->unlock_request = VM_PROT_NONE; - PAGE_WAKEUP(p); - } - - p = next; + if (object != VM_OBJECT_NULL) { + assert(object->internal); + assert(object->pager_created); + assert(!object->pager_initialized); + assert(!object->pager_ready); + assert(object->pager_trusted); + } else { + object = vm_object_allocate(size); + assert(object != VM_OBJECT_NULL); + object->internal = FALSE; + object->pager_trusted = FALSE; + /* copy strategy invalid until set by memory manager */ + object->copy_strategy = MEMORY_OBJECT_COPY_INVALID; } /* - * Wake up threads waiting for the memory object to - * become ready. - */ - - object->pager_ready = TRUE; - vm_object_wakeup(object, VM_OBJECT_EVENT_PAGER_READY); -} - -/* - * Routine: vm_object_pager_dead - * - * Purpose: - * A port is being destroy, and the IPC kobject code - * can't tell if it represents a pager port or not. - * So this function is called each time it sees a port - * die. - * THIS IS HORRIBLY INEFFICIENT. We should only call - * this routine if we had requested a notification on - * the port. - */ - -__private_extern__ void -vm_object_pager_dead( - ipc_port_t pager) -{ - vm_object_t object; - vm_object_hash_entry_t entry; - - /* - * Perform essentially the same operations as in vm_object_lookup, - * except that this time we look up based on the memory_object - * port, not the control port. + * Allocate request port. */ - vm_object_cache_lock(); - entry = vm_object_hash_lookup(pager, FALSE); - if (entry == VM_OBJECT_HASH_ENTRY_NULL || - entry->object == VM_OBJECT_NULL) { - vm_object_cache_unlock(); - return; - } - object = entry->object; - entry->object = VM_OBJECT_NULL; + control = memory_object_control_allocate(object); + assert(control != MEMORY_OBJECT_CONTROL_NULL); vm_object_lock(object); - if (object->ref_count == 0) { - XPR(XPR_VM_OBJECT_CACHE, - "vm_object_destroy: removing %x from cache, head (%x, %x)\n", - (integer_t)object, - (integer_t)vm_object_cached_list.next, - (integer_t)vm_object_cached_list.prev, 0,0); - - queue_remove(&vm_object_cached_list, object, - vm_object_t, cached_list); - vm_object_cached_count--; - } - object->ref_count++; - vm_object_res_reference(object); - - object->can_persist = FALSE; - assert(object->pager == pager); + assert(!object->pager_ready); + assert(!object->pager_initialized); + assert(object->pager == NULL); + assert(object->pager_control == NULL); /* - * Remove the pager association. - * - * Note that the memory_object itself is dead, so - * we don't bother with it. + * Copy the reference we were given. */ - object->pager = MEMORY_OBJECT_NULL; + memory_object_reference(pager); + object->pager_created = TRUE; + object->pager = pager; + object->pager_control = control; + object->pager_ready = FALSE; vm_object_unlock(object); - vm_object_cache_unlock(); - - vm_object_pager_wakeup(pager); /* - * Release the pager reference. Note that there's no - * point in trying the memory_object_terminate call - * because the memory_object itself is dead. Also - * release the memory_object_control reference, since - * the pager didn't do that either. + * Let the pager know we're using it. 
*/ - memory_object_deallocate(pager); - memory_object_control_deallocate(object->pager_request); - + (void) memory_object_init(pager, + object->pager_control, + PAGE_SIZE); - /* - * Restart pending page requests - */ vm_object_lock(object); - vm_object_abort_activity(object); - vm_object_unlock(object); - - /* - * Lose the object reference. - */ - - vm_object_deallocate(object); -} -#endif - -/* - * Routine: vm_object_enter - * Purpose: - * Find a VM object corresponding to the given - * pager; if no such object exists, create one, - * and initialize the pager. - */ -vm_object_t -vm_object_enter( - memory_object_t pager, - vm_object_size_t size, - boolean_t internal, - boolean_t init, - boolean_t named) -{ - register vm_object_t object; - vm_object_t new_object; - boolean_t must_init; - vm_object_hash_entry_t entry, new_entry; - - if (pager == MEMORY_OBJECT_NULL) - return(vm_object_allocate(size)); - - new_object = VM_OBJECT_NULL; - new_entry = VM_OBJECT_HASH_ENTRY_NULL; - must_init = init; - - /* - * Look for an object associated with this port. - */ - - vm_object_cache_lock(); - do { - entry = vm_object_hash_lookup(pager, FALSE); - - if (entry == VM_OBJECT_HASH_ENTRY_NULL) { - if (new_object == VM_OBJECT_NULL) { - /* - * We must unlock to create a new object; - * if we do so, we must try the lookup again. - */ - vm_object_cache_unlock(); - assert(new_entry == VM_OBJECT_HASH_ENTRY_NULL); - new_entry = vm_object_hash_entry_alloc(pager); - new_object = vm_object_allocate(size); - vm_object_cache_lock(); - } else { - /* - * Lookup failed twice, and we have something - * to insert; set the object. - */ - vm_object_hash_insert(new_entry); - entry = new_entry; - entry->object = new_object; - new_entry = VM_OBJECT_HASH_ENTRY_NULL; - new_object = VM_OBJECT_NULL; - must_init = TRUE; - } - } else if (entry->object == VM_OBJECT_NULL) { - /* - * If a previous object is being terminated, - * we must wait for the termination message - * to be queued (and lookup the entry again). - */ - entry->waiting = TRUE; - entry = VM_OBJECT_HASH_ENTRY_NULL; - assert_wait((event_t) pager, THREAD_UNINT); - vm_object_cache_unlock(); - thread_block(THREAD_CONTINUE_NULL); - vm_object_cache_lock(); - } - } while (entry == VM_OBJECT_HASH_ENTRY_NULL); - - object = entry->object; - assert(object != VM_OBJECT_NULL); - - if (!must_init) { - vm_object_lock(object); - assert(!internal || object->internal); - if (named) { - assert(!object->named); - object->named = TRUE; - } - if (object->ref_count == 0) { - XPR(XPR_VM_OBJECT_CACHE, - "vm_object_enter: removing %x from cache, head (%x, %x)\n", - (integer_t)object, - (integer_t)vm_object_cached_list.next, - (integer_t)vm_object_cached_list.prev, 0,0); - queue_remove(&vm_object_cached_list, object, - vm_object_t, cached_list); - vm_object_cached_count--; - } - object->ref_count++; - vm_object_res_reference(object); - vm_object_unlock(object); - - VM_STAT(hits++); - } - assert(object->ref_count > 0); - - VM_STAT(lookups++); - - vm_object_cache_unlock(); - - XPR(XPR_VM_OBJECT, - "vm_o_enter: pager 0x%x obj 0x%x must_init %d\n", - (integer_t)pager, (integer_t)object, must_init, 0, 0); - - /* - * If we raced to create a vm_object but lost, let's - * throw away ours. - */ - - if (new_object != VM_OBJECT_NULL) - vm_object_deallocate(new_object); - - if (new_entry != VM_OBJECT_HASH_ENTRY_NULL) - vm_object_hash_entry_free(new_entry); - - if (must_init) { - memory_object_control_t control; - - /* - * Allocate request port. 
- */ - - control = memory_object_control_allocate(object); - assert (control != MEMORY_OBJECT_CONTROL_NULL); - - vm_object_lock(object); - assert(object != kernel_object); - - /* - * Copy the reference we were given. - */ - - memory_object_reference(pager); - object->pager_created = TRUE; - object->pager = pager; - object->internal = internal; - object->pager_trusted = internal; - if (!internal) { - /* copy strategy invalid until set by memory manager */ - object->copy_strategy = MEMORY_OBJECT_COPY_INVALID; - } - object->pager_control = control; - object->pager_ready = FALSE; - - vm_object_unlock(object); - - /* - * Let the pager know we're using it. - */ - - (void) memory_object_init(pager, - object->pager_control, - PAGE_SIZE); - - vm_object_lock(object); - if (named) - object->named = TRUE; - if (internal) { - object->pager_ready = TRUE; - vm_object_wakeup(object, VM_OBJECT_EVENT_PAGER_READY); - } - - object->pager_initialized = TRUE; - vm_object_wakeup(object, VM_OBJECT_EVENT_INITIALIZED); - } else { - vm_object_lock(object); + if (named) { + object->named = TRUE; + } + if (object->internal) { + object->pager_ready = TRUE; + vm_object_wakeup(object, VM_OBJECT_EVENT_PAGER_READY); } - /* - * [At this point, the object must be locked] - */ - - /* - * Wait for the work above to be done by the first - * thread to map this object. - */ + object->pager_initialized = TRUE; + vm_object_wakeup(object, VM_OBJECT_EVENT_INITIALIZED); - while (!object->pager_initialized) { - vm_object_sleep(object, - VM_OBJECT_EVENT_INITIALIZED, - THREAD_UNINT); - } vm_object_unlock(object); - XPR(XPR_VM_OBJECT, - "vm_object_enter: vm_object %x, memory_object %x, internal %d\n", - (integer_t)object, (integer_t)object->pager, internal, 0,0); - return(object); + return object; } /* - * Routine: vm_object_pager_create + * Routine: vm_object_compressor_pager_create * Purpose: * Create a memory object for an internal object. * In/out conditions: @@ -2946,30 +4059,20 @@ vm_object_enter( * it may be unlocked within this call. * Limitations: * Only one thread may be performing a - * vm_object_pager_create on an object at + * vm_object_compressor_pager_create on an object at * a time. Presumably, only the pageout * daemon will be using this routine. 
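 *
 * A caller is expected to look roughly like this (illustrative sketch
 * only; the exact guard used at the real call sites in the pageout
 * path may differ):
 *
 *	vm_object_lock(object);
 *	if (!object->pager_created) {
 *		vm_object_compressor_pager_create(object);
 *	}
 *	... the object lock may have been dropped and retaken inside ...
 *	vm_object_unlock(object);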
*/ void -vm_object_pager_create( - register vm_object_t object) +vm_object_compressor_pager_create( + vm_object_t object) { - memory_object_t pager; - vm_object_hash_entry_t entry; -#if MACH_PAGEMAP - vm_object_size_t size; - vm_external_map_t map; -#endif /* MACH_PAGEMAP */ - - XPR(XPR_VM_OBJECT, "vm_object_pager_create, object 0x%X\n", - (integer_t)object, 0,0,0,0); + memory_object_t pager; + vm_object_t pager_object = VM_OBJECT_NULL; assert(object != kernel_object); - if (memory_manager_default_check() != KERN_SUCCESS) - return; - /* * Prevent collapse or termination by holding a paging reference */ @@ -2982,74 +4085,71 @@ vm_object_pager_create( */ while (!object->pager_initialized) { vm_object_sleep(object, - VM_OBJECT_EVENT_INITIALIZED, - THREAD_UNINT); + VM_OBJECT_EVENT_INITIALIZED, + THREAD_UNINT); } vm_object_paging_end(object); return; } + if ((uint32_t) (object->vo_size / PAGE_SIZE) != + (object->vo_size / PAGE_SIZE)) { +#if DEVELOPMENT || DEBUG + printf("vm_object_compressor_pager_create(%p): " + "object size 0x%llx >= 0x%llx\n", + object, + (uint64_t) object->vo_size, + 0x0FFFFFFFFULL * PAGE_SIZE); +#endif /* DEVELOPMENT || DEBUG */ + vm_object_paging_end(object); + return; + } + /* * Indicate that a memory object has been assigned * before dropping the lock, to prevent a race. */ object->pager_created = TRUE; + object->pager_trusted = TRUE; object->paging_offset = 0; - -#if MACH_PAGEMAP - size = object->size; -#endif /* MACH_PAGEMAP */ - vm_object_unlock(object); -#if MACH_PAGEMAP - map = vm_external_create(size); - vm_object_lock(object); - assert(object->size == size); - object->existence_map = map; vm_object_unlock(object); -#endif /* MACH_PAGEMAP */ /* * Create the [internal] pager, and associate it with this object. * * We make the association here so that vm_object_enter() - * can look up the object to complete initializing it. No + * can look up the object to complete initializing it. No * user will ever map this object. */ { - memory_object_default_t dmm; - vm_size_t cluster_size; - - /* acquire a reference for the default memory manager */ - dmm = memory_manager_default_reference(&cluster_size); - assert(cluster_size >= PAGE_SIZE); - - object->cluster_size = cluster_size; /* XXX ??? */ - assert(object->temporary); - /* create our new memory object */ - (void) memory_object_create(dmm, object->size, &pager); - - memory_object_default_deallocate(dmm); - } - - entry = vm_object_hash_entry_alloc(pager); - - vm_object_cache_lock(); - vm_object_hash_insert(entry); - - entry->object = object; - vm_object_cache_unlock(); + assert((uint32_t) (object->vo_size / PAGE_SIZE) == + (object->vo_size / PAGE_SIZE)); + (void) compressor_memory_object_create( + (memory_object_size_t) object->vo_size, + &pager); + if (pager == NULL) { + panic("vm_object_compressor_pager_create(): " + "no pager for object %p size 0x%llx\n", + object, (uint64_t) object->vo_size); + } + } /* * A reference was returned by * memory_object_create(), and it is - * copied by vm_object_enter(). + * copied by vm_object_memory_object_associate(). 
*/ - if (vm_object_enter(pager, object->size, TRUE, TRUE, FALSE) != object) - panic("vm_object_pager_create: mismatch"); + pager_object = vm_object_memory_object_associate(pager, + object, + object->vo_size, + FALSE); + if (pager_object != object) { + panic("vm_object_compressor_pager_create: mismatch (pager: %p, pager_object: %p, orig_object: %p, orig_object size: 0x%llx)\n", pager, pager_object, object, (uint64_t) object->vo_size); + } /* * Drop the reference we were passed. @@ -3065,45 +4165,93 @@ vm_object_pager_create( } /* - * Routine: vm_object_remove - * Purpose: - * Eliminate the pager/object association - * for this pager. - * Conditions: - * The object cache must be locked. + * Global variables for vm_object_collapse(): + * + * Counts for normal collapses and bypasses. + * Debugging variables, to watch or disable collapse. */ -__private_extern__ void -vm_object_remove( - vm_object_t object) +static long object_collapses = 0; +static long object_bypasses = 0; + +static boolean_t vm_object_collapse_allowed = TRUE; +static boolean_t vm_object_bypass_allowed = TRUE; + +void vm_object_do_collapse_compressor(vm_object_t object, + vm_object_t backing_object); +void +vm_object_do_collapse_compressor( + vm_object_t object, + vm_object_t backing_object) { - memory_object_t pager; + vm_object_offset_t new_offset, backing_offset; + vm_object_size_t size; - if ((pager = object->pager) != MEMORY_OBJECT_NULL) { - vm_object_hash_entry_t entry; + vm_counters.do_collapse_compressor++; - entry = vm_object_hash_lookup(pager, FALSE); - if (entry != VM_OBJECT_HASH_ENTRY_NULL) - entry->object = VM_OBJECT_NULL; - } + vm_object_lock_assert_exclusive(object); + vm_object_lock_assert_exclusive(backing_object); -} + size = object->vo_size; -/* - * Global variables for vm_object_collapse(): - * - * Counts for normal collapses and bypasses. - * Debugging variables, to watch or disable collapse. - */ -static long object_collapses = 0; -static long object_bypasses = 0; + /* + * Move all compressed pages from backing_object + * to the parent. + */ + + for (backing_offset = object->vo_shadow_offset; + backing_offset < object->vo_shadow_offset + object->vo_size; + backing_offset += PAGE_SIZE) { + memory_object_offset_t backing_pager_offset; + + /* find the next compressed page at or after this offset */ + backing_pager_offset = (backing_offset + + backing_object->paging_offset); + backing_pager_offset = vm_compressor_pager_next_compressed( + backing_object->pager, + backing_pager_offset); + if (backing_pager_offset == (memory_object_offset_t) -1) { + /* no more compressed pages */ + break; + } + backing_offset = (backing_pager_offset - + backing_object->paging_offset); -static boolean_t vm_object_collapse_allowed = TRUE; -static boolean_t vm_object_bypass_allowed = TRUE; + new_offset = backing_offset - object->vo_shadow_offset; -static int vm_external_discarded; -static int vm_external_collapsed; + if (new_offset >= object->vo_size) { + /* we're out of the scope of "object": done */ + break; + } + + if ((vm_page_lookup(object, new_offset) != VM_PAGE_NULL) || + (vm_compressor_pager_state_get(object->pager, + (new_offset + + object->paging_offset)) == + VM_EXTERNAL_STATE_EXISTS)) { + /* + * This page already exists in object, resident or + * compressed. + * We don't need this compressed page in backing_object + * and it will be reclaimed when we release + * backing_object. 
+ */ + continue; + } -unsigned long vm_object_collapse_encrypted = 0; + /* + * backing_object has this page in the VM compressor and + * we need to transfer it to object. + */ + vm_counters.do_collapse_compressor_pages++; + vm_compressor_pager_transfer( + /* destination: */ + object->pager, + (new_offset + object->paging_offset), + /* source: */ + backing_object->pager, + (backing_offset + backing_object->paging_offset)); + } +} /* * Routine: vm_object_do_collapse @@ -3125,8 +4273,14 @@ vm_object_do_collapse( vm_object_offset_t new_offset, backing_offset; vm_object_size_t size; - backing_offset = object->shadow_offset; - size = object->size; + vm_object_lock_assert_exclusive(object); + vm_object_lock_assert_exclusive(backing_object); + + assert(object->purgable == VM_PURGABLE_DENY); + assert(backing_object->purgable == VM_PURGABLE_DENY); + + backing_offset = object->vo_shadow_offset; + size = object->vo_size; /* * Move all in-memory pages from backing_object @@ -3134,14 +4288,13 @@ vm_object_do_collapse( * will be overwritten by any of the parent's * pages that shadow them. */ - - while (!queue_empty(&backing_object->memq)) { - - p = (vm_page_t) queue_first(&backing_object->memq); - - new_offset = (p->offset - backing_offset); - - assert(!p->busy || p->absent); + + while (!vm_page_queue_empty(&backing_object->memq)) { + p = (vm_page_t) vm_page_queue_first(&backing_object->memq); + + new_offset = (p->vmp_offset - backing_offset); + + assert(!p->vmp_busy || p->vmp_absent); /* * If the parent has a page here, or if @@ -3150,52 +4303,32 @@ vm_object_do_collapse( * * Otherwise, move it as planned. */ - - if (p->offset < backing_offset || new_offset >= size) { + + if (p->vmp_offset < backing_offset || new_offset >= size) { VM_PAGE_FREE(p); } else { - /* - * ENCRYPTED SWAP: - * The encryption key includes the "pager" and the - * "paging_offset". These might not be the same in - * the new object, so we can't just move an encrypted - * page from one object to the other. We can't just - * decrypt the page here either, because that would drop - * the object lock. - * The caller should check for encrypted pages before - * attempting to collapse. - */ - ASSERT_PAGE_DECRYPTED(p); - pp = vm_page_lookup(object, new_offset); if (pp == VM_PAGE_NULL) { - - /* - * Parent now has no page. - * Move the backing object's page up. - */ - - vm_page_rename(p, object, new_offset); -#if MACH_PAGEMAP - } else if (pp->absent) { - - /* - * Parent has an absent page... - * it's not being paged in, so - * it must really be missing from - * the parent. - * - * Throw out the absent page... - * any faults looking for that - * page will restart with the new - * one. - */ - - VM_PAGE_FREE(pp); - vm_page_rename(p, object, new_offset); -#endif /* MACH_PAGEMAP */ + if (VM_COMPRESSOR_PAGER_STATE_GET(object, + new_offset) + == VM_EXTERNAL_STATE_EXISTS) { + /* + * Parent object has this page + * in the VM compressor. + * Throw away the backing + * object's page. + */ + VM_PAGE_FREE(p); + } else { + /* + * Parent now has no page. + * Move the backing object's page + * up. + */ + vm_page_rename(p, object, new_offset); + } } else { - assert(! pp->absent); + assert(!pp->vmp_absent); /* * Parent object has a real page. 
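 * Summing up the migration loop above: each resident page p of
 * backing_object, at offset new_offset relative to this object, is
 * handled in one of three ways:
 *
 *	outside the range this object shadows      -> VM_PAGE_FREE(p)
 *	object already has the page (resident,
 *	or in the compressor)                      -> VM_PAGE_FREE(p)
 *	object has nothing at that offset          -> vm_page_rename(p, object, new_offset)
 *
 * so the object keeps its own copy wherever it has one and inherits
 * the backing object's page only where it has none.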
@@ -3206,18 +4339,17 @@ vm_object_do_collapse( } } } - -#if !MACH_PAGEMAP - assert(!object->pager_created && object->pager == MEMORY_OBJECT_NULL - || (!backing_object->pager_created - && backing_object->pager == MEMORY_OBJECT_NULL)); -#else - assert(!object->pager_created && object->pager == MEMORY_OBJECT_NULL); -#endif /* !MACH_PAGEMAP */ - - if (backing_object->pager != MEMORY_OBJECT_NULL) { - vm_object_hash_entry_t entry; + if (vm_object_collapse_compressor_allowed && + object->pager != MEMORY_OBJECT_NULL && + backing_object->pager != MEMORY_OBJECT_NULL) { + /* move compressed pages from backing_object to object */ + vm_object_do_collapse_compressor(object, backing_object); + } else if (backing_object->pager != MEMORY_OBJECT_NULL) { + assert((!object->pager_created && + (object->pager == MEMORY_OBJECT_NULL)) || + (!backing_object->pager_created && + (backing_object->pager == MEMORY_OBJECT_NULL))); /* * Move the pager from backing_object to object. * @@ -3227,68 +4359,48 @@ vm_object_do_collapse( */ assert(!object->paging_in_progress); + assert(!object->activity_in_progress); + assert(!object->pager_created); + assert(object->pager == NULL); object->pager = backing_object->pager; - entry = vm_object_hash_lookup(object->pager, FALSE); - assert(entry != VM_OBJECT_HASH_ENTRY_NULL); - entry->object = object; + object->pager_created = backing_object->pager_created; object->pager_control = backing_object->pager_control; object->pager_ready = backing_object->pager_ready; object->pager_initialized = backing_object->pager_initialized; - object->cluster_size = backing_object->cluster_size; object->paging_offset = backing_object->paging_offset + backing_offset; if (object->pager_control != MEMORY_OBJECT_CONTROL_NULL) { memory_object_control_collapse(object->pager_control, - object); + object); } + /* the backing_object has lost its pager: reset all fields */ + backing_object->pager_created = FALSE; + backing_object->pager_control = NULL; + backing_object->pager_ready = FALSE; + backing_object->paging_offset = 0; + backing_object->pager = NULL; } - - vm_object_cache_unlock(); - -#if MACH_PAGEMAP - /* - * If the shadow offset is 0, the use the existence map from - * the backing object if there is one. If the shadow offset is - * not zero, toss it. - * - * XXX - If the shadow offset is not 0 then a bit copy is needed - * if the map is to be salvaged. For now, we just just toss the - * old map, giving the collapsed object no map. This means that - * the pager is invoked for zero fill pages. If analysis shows - * that this happens frequently and is a performance hit, then - * this code should be fixed to salvage the map. - */ - assert(object->existence_map == VM_EXTERNAL_NULL); - if (backing_offset || (size != backing_object->size)) { - vm_external_discarded++; - vm_external_destroy(backing_object->existence_map, - backing_object->size); - } - else { - vm_external_collapsed++; - object->existence_map = backing_object->existence_map; - } - backing_object->existence_map = VM_EXTERNAL_NULL; -#endif /* MACH_PAGEMAP */ - /* * Object now shadows whatever backing_object did. * Note that the reference to backing_object->shadow * moves from within backing_object to within object. 
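 *
 * Roughly:
 *
 *	before:  object --(vo_shadow_offset A)--> backing_object --(B)--> X
 *	after:   object --(vo_shadow_offset A + B)----------------------> X
 *
 * where X is backing_object's own shadow; if backing_object had no
 * shadow, object's vo_shadow_offset is simply reset to 0.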
*/ - + assert(!object->phys_contiguous); assert(!backing_object->phys_contiguous); object->shadow = backing_object->shadow; if (object->shadow) { - object->shadow_offset += backing_object->shadow_offset; + object->vo_shadow_offset += backing_object->vo_shadow_offset; + /* "backing_object" gave its shadow to "object" */ + backing_object->shadow = VM_OBJECT_NULL; + backing_object->vo_shadow_offset = 0; } else { /* no shadow, therefore no shadow offset... */ - object->shadow_offset = 0; + object->vo_shadow_offset = 0; } assert((object->shadow == VM_OBJECT_NULL) || - (object->shadow->copy != backing_object)); + (object->shadow->copy != backing_object)); /* * Discard backing_object. @@ -3297,20 +4409,39 @@ vm_object_do_collapse( * pager left, and no object references within it, * all that is necessary is to dispose of it. */ - - assert((backing_object->ref_count == 1) && - (backing_object->resident_page_count == 0) && - (backing_object->paging_in_progress == 0)); + object_collapses++; + + assert(backing_object->ref_count == 1); + assert(backing_object->resident_page_count == 0); + assert(backing_object->paging_in_progress == 0); + assert(backing_object->activity_in_progress == 0); + assert(backing_object->shadow == VM_OBJECT_NULL); + assert(backing_object->vo_shadow_offset == 0); + + if (backing_object->pager != MEMORY_OBJECT_NULL) { + /* ... unless it has a pager; need to terminate pager too */ + vm_counters.do_collapse_terminate++; + if (vm_object_terminate(backing_object) != KERN_SUCCESS) { + vm_counters.do_collapse_terminate_failure++; + } + return; + } + + assert(backing_object->pager == NULL); backing_object->alive = FALSE; vm_object_unlock(backing_object); - XPR(XPR_VM_OBJECT, "vm_object_collapse, collapsed 0x%X\n", - (integer_t)backing_object, 0,0,0,0); +#if VM_OBJECT_TRACKING + if (vm_object_tracking_inited) { + btlog_remove_entries_for_element(vm_object_tracking_btlog, + backing_object); + } +#endif /* VM_OBJECT_TRACKING */ + + vm_object_lock_destroy(backing_object); zfree(vm_object_zone, backing_object); - - object_collapses++; } static void @@ -3322,10 +4453,13 @@ vm_object_do_bypass( * Make the parent shadow the next object * in the chain. */ - -#if TASK_SWAPPER + + vm_object_lock_assert_exclusive(object); + vm_object_lock_assert_exclusive(backing_object); + +#if TASK_SWAPPER /* - * Do object reference in-line to + * Do object reference in-line to * conditionally increment shadow's * residence count. If object is not * resident, leave residence count @@ -3333,48 +4467,50 @@ vm_object_do_bypass( */ if (backing_object->shadow != VM_OBJECT_NULL) { vm_object_lock(backing_object->shadow); + vm_object_lock_assert_exclusive(backing_object->shadow); backing_object->shadow->ref_count++; - if (object->res_count != 0) + if (object->res_count != 0) { vm_object_res_reference(backing_object->shadow); + } vm_object_unlock(backing_object->shadow); } -#else /* TASK_SWAPPER */ +#else /* TASK_SWAPPER */ vm_object_reference(backing_object->shadow); -#endif /* TASK_SWAPPER */ +#endif /* TASK_SWAPPER */ assert(!object->phys_contiguous); assert(!backing_object->phys_contiguous); object->shadow = backing_object->shadow; if (object->shadow) { - object->shadow_offset += backing_object->shadow_offset; + object->vo_shadow_offset += backing_object->vo_shadow_offset; } else { /* no shadow, therefore no shadow offset... */ - object->shadow_offset = 0; + object->vo_shadow_offset = 0; } - + /* * Backing object might have had a copy pointer - * to us. If it did, clear it. + * to us. If it did, clear it. 
*/ if (backing_object->copy == object) { backing_object->copy = VM_OBJECT_NULL; } - + /* * Drop the reference count on backing_object. -#if TASK_SWAPPER + #if TASK_SWAPPER * Since its ref_count was at least 2, it * will not vanish; so we don't need to call * vm_object_deallocate. - * [FBDP: that doesn't seem to be true any more] - * + * [with a caveat for "named" objects] + * * The res_count on the backing object is * conditionally decremented. It's possible * (via vm_pageout_scan) to get here with * a "swapped" object, which has a 0 res_count, * in which case, the backing object res_count * is already down by one. -#else + #else * Don't call vm_object_deallocate unless * ref_count drops to zero. * @@ -3382,30 +4518,44 @@ vm_object_do_bypass( * backing object could be bypassed but not * collapsed, such as when the backing object * is temporary and cachable. -#endif + #endif */ - if (backing_object->ref_count > 1) { + if (backing_object->ref_count > 2 || + (!backing_object->named && backing_object->ref_count > 1)) { + vm_object_lock_assert_exclusive(backing_object); backing_object->ref_count--; -#if TASK_SWAPPER - if (object->res_count != 0) +#if TASK_SWAPPER + if (object->res_count != 0) { vm_object_res_deallocate(backing_object); + } assert(backing_object->ref_count > 0); -#endif /* TASK_SWAPPER */ +#endif /* TASK_SWAPPER */ vm_object_unlock(backing_object); } else { - /* * Drop locks so that we can deallocate * the backing object. */ -#if TASK_SWAPPER +#if TASK_SWAPPER if (object->res_count == 0) { /* XXX get a reference for the deallocate below */ vm_object_res_reference(backing_object); } -#endif /* TASK_SWAPPER */ +#endif /* TASK_SWAPPER */ + /* + * vm_object_collapse (the caller of this function) is + * now called from contexts that may not guarantee that a + * valid reference is held on the object... w/o a valid + * reference, it is unsafe and unwise (you will definitely + * regret it) to unlock the object and then retake the lock + * since the object may be terminated and recycled in between. + * The "activity_in_progress" reference will keep the object + * 'stable'. + */ + vm_object_activity_begin(object); vm_object_unlock(object); + vm_object_unlock(backing_object); vm_object_deallocate(backing_object); @@ -3417,12 +4567,13 @@ vm_object_do_bypass( */ vm_object_lock(object); + vm_object_activity_end(object); } - + object_bypasses++; } - + /* * vm_object_collapse: * @@ -3437,33 +4588,46 @@ static unsigned long vm_object_collapse_calls = 0; static unsigned long vm_object_collapse_objects = 0; static unsigned long vm_object_collapse_do_collapse = 0; static unsigned long vm_object_collapse_do_bypass = 0; + __private_extern__ void vm_object_collapse( - register vm_object_t object, - register vm_object_offset_t hint_offset) + vm_object_t object, + vm_object_offset_t hint_offset, + boolean_t can_bypass) { - register vm_object_t backing_object; - register unsigned int rcount; - register unsigned int size; - vm_object_offset_t collapse_min_offset; - vm_object_offset_t collapse_max_offset; - vm_page_t page; - vm_object_t original_object; + vm_object_t backing_object; + unsigned int rcount; + unsigned int size; + vm_object_t original_object; + int object_lock_type; + int backing_object_lock_type; vm_object_collapse_calls++; - if (! vm_object_collapse_allowed && ! 
vm_object_bypass_allowed) { + if (!vm_object_collapse_allowed && + !(can_bypass && vm_object_bypass_allowed)) { return; } - XPR(XPR_VM_OBJECT, "vm_object_collapse, obj 0x%X\n", - (integer_t)object, 0,0,0,0); - - if (object == VM_OBJECT_NULL) + if (object == VM_OBJECT_NULL) { return; + } original_object = object; + /* + * The top object was locked "exclusive" by the caller. + * In the first pass, to determine if we can collapse the shadow chain, + * take a "shared" lock on the shadow objects. If we can collapse, + * we'll have to go down the chain again with exclusive locks. + */ + object_lock_type = OBJECT_LOCK_EXCLUSIVE; + backing_object_lock_type = OBJECT_LOCK_SHARED; + +retry: + object = original_object; + vm_object_lock_assert_exclusive(object); + while (TRUE) { vm_object_collapse_objects++; /* @@ -3474,7 +4638,7 @@ vm_object_collapse( /* * There is a backing object, and */ - + backing_object = object->shadow; if (backing_object == VM_OBJECT_NULL) { if (object != original_object) { @@ -3482,24 +4646,27 @@ vm_object_collapse( } return; } - + if (backing_object_lock_type == OBJECT_LOCK_SHARED) { + vm_object_lock_shared(backing_object); + } else { + vm_object_lock(backing_object); + } + /* * No pages in the object are currently * being paged out, and */ if (object->paging_in_progress != 0 || - object->absent_count != 0) { + object->activity_in_progress != 0) { /* try and collapse the rest of the shadow chain */ - vm_object_lock(backing_object); if (object != original_object) { vm_object_unlock(object); } object = backing_object; + object_lock_type = backing_object_lock_type; continue; } - vm_object_lock(backing_object); - /* * ... * The backing object is not read_only, @@ -3508,17 +4675,42 @@ vm_object_collapse( * The backing object is internal. * */ - + if (!backing_object->internal || - backing_object->paging_in_progress != 0) { + backing_object->paging_in_progress != 0 || + backing_object->activity_in_progress != 0) { + /* try and collapse the rest of the shadow chain */ + if (object != original_object) { + vm_object_unlock(object); + } + object = backing_object; + object_lock_type = backing_object_lock_type; + continue; + } + + /* + * Purgeable objects are not supposed to engage in + * copy-on-write activities, so should not have + * any shadow objects or be a shadow object to another + * object. + * Collapsing a purgeable object would require some + * updates to the purgeable compressed ledgers. + */ + if (object->purgable != VM_PURGABLE_DENY || + backing_object->purgable != VM_PURGABLE_DENY) { + panic("vm_object_collapse() attempting to collapse " + "purgeable object: %p(%d) %p(%d)\n", + object, object->purgable, + backing_object, backing_object->purgable); /* try and collapse the rest of the shadow chain */ if (object != original_object) { vm_object_unlock(object); } object = backing_object; + object_lock_type = backing_object_lock_type; continue; } - + /* * The backing object can't be a copy-object: * the shadow_offset for the copy-object must stay @@ -3536,6 +4728,7 @@ vm_object_collapse( vm_object_unlock(object); } object = backing_object; + object_lock_type = backing_object_lock_type; continue; } @@ -3549,75 +4742,40 @@ vm_object_collapse( * object, we may be able to collapse it into the * parent. * - * If MACH_PAGEMAP is defined: - * The parent must not have a pager created for it, - * since collapsing a backing_object dumps new pages - * into the parent that its pager doesn't know about - * (and the collapse code can't merge the existence - * maps). 
- * Otherwise: * As long as one of the objects is still not known * to the pager, we can collapse them. */ if (backing_object->ref_count == 1 && - (!object->pager_created -#if !MACH_PAGEMAP - || !backing_object->pager_created -#endif /*!MACH_PAGEMAP */ + (vm_object_collapse_compressor_allowed || + !object->pager_created + || (!backing_object->pager_created) ) && vm_object_collapse_allowed) { - - XPR(XPR_VM_OBJECT, - "vm_object_collapse: %x to %x, pager %x, pager_control %x\n", - (integer_t)backing_object, (integer_t)object, - (integer_t)backing_object->pager, - (integer_t)backing_object->pager_control, 0); - /* - * We need the cache lock for collapsing, - * but we must not deadlock. + * We need the exclusive lock on the VM objects. */ - - if (! vm_object_cache_lock_try()) { + if (backing_object_lock_type != OBJECT_LOCK_EXCLUSIVE) { + /* + * We have an object and its shadow locked + * "shared". We can't just upgrade the locks + * to "exclusive", as some other thread might + * also have these objects locked "shared" and + * attempt to upgrade one or the other to + * "exclusive". The upgrades would block + * forever waiting for the other "shared" locks + * to get released. + * So we have to release the locks and go + * down the shadow chain again (since it could + * have changed) with "exclusive" locking. + */ + vm_object_unlock(backing_object); if (object != original_object) { vm_object_unlock(object); } - vm_object_unlock(backing_object); - return; + object_lock_type = OBJECT_LOCK_EXCLUSIVE; + backing_object_lock_type = OBJECT_LOCK_EXCLUSIVE; + goto retry; } - /* - * ENCRYPTED SWAP - * We can't collapse the object if it contains - * any encypted page, because the encryption key - * includes the info. We can't - * drop the object lock in vm_object_do_collapse() - * so we can't decrypt the page there either. - */ - if (vm_pages_encrypted) { - collapse_min_offset = object->shadow_offset; - collapse_max_offset = - object->shadow_offset + object->size; - queue_iterate(&backing_object->memq, - page, vm_page_t, listq) { - if (page->encrypted && - (page->offset >= - collapse_min_offset) && - (page->offset < - collapse_max_offset)) { - /* - * We found an encrypted page - * in the backing object, - * within the range covered - * by the parent object: we can - * not collapse them. - */ - vm_object_collapse_encrypted++; - vm_object_cache_unlock(); - goto try_bypass; - } - } - } - /* * Collapse the object with its backing * object, and try again with the object's @@ -3629,18 +4787,18 @@ vm_object_collapse( continue; } - try_bypass: /* * Collapsing the backing object was not possible * or permitted, so let's try bypassing it. */ - if (! vm_object_bypass_allowed) { + if (!(can_bypass && vm_object_bypass_allowed)) { /* try and collapse the rest of the shadow chain */ if (object != original_object) { vm_object_unlock(object); } object = backing_object; + object_lock_type = backing_object_lock_type; continue; } @@ -3650,29 +4808,26 @@ vm_object_collapse( * we have to make sure no pages in the backing object * "show through" before bypassing it. 
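 *
 * The resident-count test just below is a cheap pigeonhole argument:
 * at most (atop(backing_object->vo_size) - size) of backing_object's
 * backing_rcount resident pages can lie outside the range this object
 * shadows, so if
 *
 *	backing_rcount - (atop(backing_object->vo_size) - size) > rcount
 *
 * then at least one resident backing page inside that range cannot be
 * covered by any of the object's rcount resident pages (the object has
 * no pager at this point, so residency is all that matters).  For
 * example (hypothetical numbers): both objects span 16 pages, rcount
 * is 3 and backing_rcount is 6, giving 6 - 0 > 3, so some backing page
 * must show through and the bypass is abandoned without any scanning.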
*/ - size = atop(object->size); + size = (unsigned int)atop(object->vo_size); rcount = object->resident_page_count; + if (rcount != size) { - vm_object_offset_t offset; - vm_object_offset_t backing_offset; - unsigned int backing_rcount; - unsigned int lookups = 0; + vm_object_offset_t offset; + vm_object_offset_t backing_offset; + unsigned int backing_rcount; /* * If the backing object has a pager but no pagemap, * then we cannot bypass it, because we don't know * what pages it has. */ - if (backing_object->pager_created -#if MACH_PAGEMAP - && (backing_object->existence_map == VM_EXTERNAL_NULL) -#endif /* MACH_PAGEMAP */ - ) { + if (backing_object->pager_created) { /* try and collapse the rest of the shadow chain */ if (object != original_object) { vm_object_unlock(object); } object = backing_object; + object_lock_type = backing_object_lock_type; continue; } @@ -3681,16 +4836,31 @@ vm_object_collapse( * then we cannot bypass it, because we don't know * what pages it has. */ - if (object->pager_created -#if MACH_PAGEMAP - && (object->existence_map == VM_EXTERNAL_NULL) -#endif /* MACH_PAGEMAP */ - ) { + if (object->pager_created) { /* try and collapse the rest of the shadow chain */ if (object != original_object) { vm_object_unlock(object); } object = backing_object; + object_lock_type = backing_object_lock_type; + continue; + } + + backing_offset = object->vo_shadow_offset; + backing_rcount = backing_object->resident_page_count; + + if ((int)backing_rcount - (int)(atop(backing_object->vo_size) - size) > (int)rcount) { + /* + * we have enough pages in the backing object to guarantee that + * at least 1 of them must be 'uncovered' by a resident page + * in the object we're evaluating, so move on and + * try to collapse the rest of the shadow chain + */ + if (object != original_object) { + vm_object_unlock(object); + } + object = backing_object; + object_lock_type = backing_object_lock_type; continue; } @@ -3707,34 +4877,33 @@ vm_object_collapse( * */ - backing_offset = object->shadow_offset; - backing_rcount = backing_object->resident_page_count; - -#define EXISTS_IN_OBJECT(obj, off, rc) \ - (vm_external_state_get((obj)->existence_map, \ - (vm_offset_t)(off)) == VM_EXTERNAL_STATE_EXISTS || \ - ((rc) && ++lookups && vm_page_lookup((obj), (off)) != VM_PAGE_NULL && (rc)--)) +#define EXISTS_IN_OBJECT(obj, off, rc) \ + ((VM_COMPRESSOR_PAGER_STATE_GET((obj), (off)) \ + == VM_EXTERNAL_STATE_EXISTS) || \ + ((rc) && vm_page_lookup((obj), (off)) != VM_PAGE_NULL && (rc)--)) /* * Check the hint location first * (since it is often the quickest way out of here). */ - if (object->cow_hint != ~(vm_offset_t)0) + if (object->cow_hint != ~(vm_offset_t)0) { hint_offset = (vm_object_offset_t)object->cow_hint; - else + } else { hint_offset = (hint_offset > 8 * PAGE_SIZE_64) ? - (hint_offset - 8 * PAGE_SIZE_64) : 0; + (hint_offset - 8 * PAGE_SIZE_64) : 0; + } if (EXISTS_IN_OBJECT(backing_object, hint_offset + - backing_offset, backing_rcount) && + backing_offset, backing_rcount) && !EXISTS_IN_OBJECT(object, hint_offset, rcount)) { /* dependency right at the hint */ - object->cow_hint = (vm_offset_t)hint_offset; + object->cow_hint = (vm_offset_t) hint_offset; /* atomic */ /* try and collapse the rest of the shadow chain */ if (object != original_object) { vm_object_unlock(object); } object = backing_object; + object_lock_type = backing_object_lock_type; continue; } @@ -3744,46 +4913,40 @@ vm_object_collapse( * pages in the backing object, it makes sense to * walk the backing_object's resident pages first. 
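 *
 * The EXISTS_IN_OBJECT() probe used by these scans reports a page as
 * present when it is either in the compressor
 * (VM_COMPRESSOR_PAGER_STATE_GET() == VM_EXTERNAL_STATE_EXISTS) or
 * resident (vm_page_lookup()).  Its rc argument tracks how many of the
 * object's resident pages remain unaccounted for: each resident hit
 * decrements it, and once it reaches zero the relatively expensive
 * vm_page_lookup() is short-circuited away entirely.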
* - * NOTE: Pages may be in both the existence map and - * resident. So, we can't permanently decrement - * the rcount here because the second loop may - * find the same pages in the backing object' - * existence map that we found here and we would - * double-decrement the rcount. We also may or - * may not have found the + * NOTE: Pages may be in both the existence map and/or + * resident, so if we don't find a dependency while + * walking the backing object's resident page list + * directly, and there is an existence map, we'll have + * to run the offset based 2nd pass. Because we may + * have to run both passes, we need to be careful + * not to decrement 'rcount' in the 1st pass */ - if (backing_rcount && size > - ((backing_object->existence_map) ? - backing_rcount : (backing_rcount >> 1))) { + if (backing_rcount && backing_rcount < (size / 8)) { unsigned int rc = rcount; vm_page_t p; backing_rcount = backing_object->resident_page_count; - p = (vm_page_t)queue_first(&backing_object->memq); + p = (vm_page_t)vm_page_queue_first(&backing_object->memq); do { - /* Until we get more than one lookup lock */ - if (lookups > 256) { - lookups = 0; - delay(1); - } + offset = (p->vmp_offset - backing_offset); - offset = (p->offset - backing_offset); - if (offset < object->size && + if (offset < object->vo_size && offset != hint_offset && !EXISTS_IN_OBJECT(object, offset, rc)) { /* found a dependency */ - object->cow_hint = (vm_offset_t)offset; + object->cow_hint = (vm_offset_t) offset; /* atomic */ + break; } - p = (vm_page_t) queue_next(&p->listq); - + p = (vm_page_t) vm_page_queue_next(&p->vmp_listq); } while (--backing_rcount); - if (backing_rcount != 0 ) { + if (backing_rcount != 0) { /* try and collapse the rest of the shadow chain */ if (object != original_object) { vm_object_unlock(object); } object = backing_object; + object_lock_type = backing_object_lock_type; continue; } } @@ -3792,24 +4955,17 @@ vm_object_collapse( * Walk through the offsets looking for pages in the * backing object that show through to the object. */ - if (backing_rcount || backing_object->existence_map) { + if (backing_rcount) { offset = hint_offset; - - while((offset = - (offset + PAGE_SIZE_64 < object->size) ? - (offset + PAGE_SIZE_64) : 0) != hint_offset) { - - /* Until we get more than one lookup lock */ - if (lookups > 256) { - lookups = 0; - delay(1); - } + while ((offset = + (offset + PAGE_SIZE_64 < object->vo_size) ? + (offset + PAGE_SIZE_64) : 0) != hint_offset) { if (EXISTS_IN_OBJECT(backing_object, offset + - backing_offset, backing_rcount) && + backing_offset, backing_rcount) && !EXISTS_IN_OBJECT(object, offset, rcount)) { /* found a dependency */ - object->cow_hint = (vm_offset_t)offset; + object->cow_hint = (vm_offset_t) offset; /* atomic */ break; } } @@ -3819,11 +4975,25 @@ vm_object_collapse( vm_object_unlock(object); } object = backing_object; + object_lock_type = backing_object_lock_type; continue; } } } + /* + * We need "exclusive" locks on the 2 VM objects. 
+ */ + if (backing_object_lock_type != OBJECT_LOCK_EXCLUSIVE) { + vm_object_unlock(backing_object); + if (object != original_object) { + vm_object_unlock(object); + } + object_lock_type = OBJECT_LOCK_EXCLUSIVE; + backing_object_lock_type = OBJECT_LOCK_EXCLUSIVE; + goto retry; + } + /* reset the offset hint for any objects deeper in the chain */ object->cow_hint = (vm_offset_t)0; @@ -3843,9 +5013,12 @@ vm_object_collapse( continue; } - if (object != original_object) { - vm_object_unlock(object); - } + /* NOT REACHED */ + /* + * if (object != original_object) { + * vm_object_unlock(object); + * } + */ } /* @@ -3864,11 +5037,11 @@ unsigned int vm_object_page_remove_iterate = 0; __private_extern__ void vm_object_page_remove( - register vm_object_t object, - register vm_object_offset_t start, - register vm_object_offset_t end) + vm_object_t object, + vm_object_offset_t start, + vm_object_offset_t end) { - register vm_page_t p, next; + vm_page_t p, next; /* * One and two page removals are most popular. @@ -3876,28 +5049,30 @@ vm_object_page_remove( * It balances vm_object_lookup vs iteration. */ - if (atop_64(end - start) < (unsigned)object->resident_page_count/16) { + if (atop_64(end - start) < (unsigned)object->resident_page_count / 16) { vm_object_page_remove_lookup++; for (; start < end; start += PAGE_SIZE_64) { p = vm_page_lookup(object, start); if (p != VM_PAGE_NULL) { - assert(!p->cleaning && !p->pageout); - if (!p->fictitious) - pmap_disconnect(p->phys_page); + assert(!p->vmp_cleaning && !p->vmp_laundry); + if (!p->vmp_fictitious && p->vmp_pmapped) { + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p)); + } VM_PAGE_FREE(p); } } } else { vm_object_page_remove_iterate++; - p = (vm_page_t) queue_first(&object->memq); - while (!queue_end(&object->memq, (queue_entry_t) p)) { - next = (vm_page_t) queue_next(&p->listq); - if ((start <= p->offset) && (p->offset < end)) { - assert(!p->cleaning && !p->pageout); - if (!p->fictitious) - pmap_disconnect(p->phys_page); + p = (vm_page_t) vm_page_queue_first(&object->memq); + while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t) p)) { + next = (vm_page_t) vm_page_queue_next(&p->vmp_listq); + if ((start <= p->vmp_offset) && (p->vmp_offset < end)) { + assert(!p->vmp_cleaning && !p->vmp_laundry); + if (!p->vmp_fictitious && p->vmp_pmapped) { + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p)); + } VM_PAGE_FREE(p); } p = next; @@ -3933,37 +5108,33 @@ static int vm_object_coalesce_count = 0; __private_extern__ boolean_t vm_object_coalesce( - register vm_object_t prev_object, - vm_object_t next_object, - vm_object_offset_t prev_offset, + vm_object_t prev_object, + vm_object_t next_object, + vm_object_offset_t prev_offset, __unused vm_object_offset_t next_offset, - vm_object_size_t prev_size, - vm_object_size_t next_size) + vm_object_size_t prev_size, + vm_object_size_t next_size) { - vm_object_size_t newsize; + vm_object_size_t newsize; -#ifdef lint +#ifdef lint next_offset++; -#endif /* lint */ +#endif /* lint */ if (next_object != VM_OBJECT_NULL) { - return(FALSE); + return FALSE; } if (prev_object == VM_OBJECT_NULL) { - return(TRUE); + return TRUE; } - XPR(XPR_VM_OBJECT, - "vm_object_coalesce: 0x%X prev_off 0x%X prev_size 0x%X next_size 0x%X\n", - (integer_t)prev_object, prev_offset, prev_size, next_size, 0); - vm_object_lock(prev_object); /* * Try to collapse the object first */ - vm_object_collapse(prev_object, prev_offset); + vm_object_collapse(prev_object, prev_offset, TRUE); /* * Can't coalesce if pages not mapped to @@ -3972,7 +5143,7 @@ 
vm_object_coalesce( * . paged out * . shadows another object * . has a copy elsewhere - * . is purgable + * . is purgeable * . paging references (pages might be in page-list) */ @@ -3981,10 +5152,11 @@ vm_object_coalesce( (prev_object->shadow != VM_OBJECT_NULL) || (prev_object->copy != VM_OBJECT_NULL) || (prev_object->true_share != FALSE) || - (prev_object->purgable != VM_OBJECT_NONPURGABLE) || - (prev_object->paging_in_progress != 0)) { + (prev_object->purgable != VM_PURGABLE_DENY) || + (prev_object->paging_in_progress != 0) || + (prev_object->activity_in_progress != 0)) { vm_object_unlock(prev_object); - return(FALSE); + return FALSE; } vm_object_coalesce_count++; @@ -3994,628 +5166,150 @@ vm_object_coalesce( * a previous deallocation. */ vm_object_page_remove(prev_object, - prev_offset + prev_size, - prev_offset + prev_size + next_size); + prev_offset + prev_size, + prev_offset + prev_size + next_size); /* * Extend the object if necessary. */ newsize = prev_offset + prev_size + next_size; - if (newsize > prev_object->size) { -#if MACH_PAGEMAP - /* - * We cannot extend an object that has existence info, - * since the existence info might then fail to cover - * the entire object. - * - * This assertion must be true because the object - * has no pager, and we only create existence info - * for objects with pagers. - */ - assert(prev_object->existence_map == VM_EXTERNAL_NULL); -#endif /* MACH_PAGEMAP */ - prev_object->size = newsize; + if (newsize > prev_object->vo_size) { + prev_object->vo_size = newsize; } vm_object_unlock(prev_object); - return(TRUE); -} - -/* - * Attach a set of physical pages to an object, so that they can - * be mapped by mapping the object. Typically used to map IO memory. - * - * The mapping function and its private data are used to obtain the - * physical addresses for each page to be mapped. - */ -void -vm_object_page_map( - vm_object_t object, - vm_object_offset_t offset, - vm_object_size_t size, - vm_object_offset_t (*map_fn)(void *map_fn_data, - vm_object_offset_t offset), - void *map_fn_data) /* private to map_fn */ -{ - int num_pages; - int i; - vm_page_t m; - vm_page_t old_page; - vm_object_offset_t addr; - - num_pages = atop_64(size); - - for (i = 0; i < num_pages; i++, offset += PAGE_SIZE_64) { - - addr = (*map_fn)(map_fn_data, offset); - - while ((m = vm_page_grab_fictitious()) == VM_PAGE_NULL) - vm_page_more_fictitious(); - - vm_object_lock(object); - if ((old_page = vm_page_lookup(object, offset)) - != VM_PAGE_NULL) - { - vm_page_lock_queues(); - vm_page_free(old_page); - vm_page_unlock_queues(); - } - - vm_page_init(m, addr); - /* private normally requires lock_queues but since we */ - /* are initializing the page, its not necessary here */ - m->private = TRUE; /* don`t free page */ - m->wire_count = 1; - vm_page_insert(m, object, offset); - - PAGE_WAKEUP_DONE(m); - vm_object_unlock(object); - } -} - -#include - -#if MACH_KDB -#include -#include - -#define printf kdbprintf - -extern boolean_t vm_object_cached( - vm_object_t object); - -extern void print_bitstring( - char byte); - -boolean_t vm_object_print_pages = FALSE; - -void -print_bitstring( - char byte) -{ - printf("%c%c%c%c%c%c%c%c", - ((byte & (1 << 0)) ? '1' : '0'), - ((byte & (1 << 1)) ? '1' : '0'), - ((byte & (1 << 2)) ? '1' : '0'), - ((byte & (1 << 3)) ? '1' : '0'), - ((byte & (1 << 4)) ? '1' : '0'), - ((byte & (1 << 5)) ? '1' : '0'), - ((byte & (1 << 6)) ? '1' : '0'), - ((byte & (1 << 7)) ? 
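[Editor's note] The series of checks in vm_object_coalesce above amounts to a single predicate: the previous object can only be grown in place if nothing else can observe it or is operating on it. A compact restatement of that predicate over an invented flag bundle (toy_object_state), not the kernel's vm_object:

#include <stdbool.h>

/* hypothetical bundle standing in for the vm_object fields tested above */
struct toy_object_state {
	bool pager_created;
	bool has_shadow;
	bool has_copy;
	bool true_share;
	bool purgeable;
	unsigned paging_in_progress;
	unsigned activity_in_progress;
};

/*
 * An anonymous object can only be extended in place if it has never been
 * paged, shadows nothing, has no copy, is not shared or purgeable, and no
 * paging or other activity is in flight.
 */
static bool
can_extend_in_place(const struct toy_object_state *o)
{
	return !o->pager_created &&
	       !o->has_shadow &&
	       !o->has_copy &&
	       !o->true_share &&
	       !o->purgeable &&
	       o->paging_in_progress == 0 &&
	       o->activity_in_progress == 0;
}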
'1' : '0')); -} - -boolean_t -vm_object_cached( - register vm_object_t object) -{ - register vm_object_t o; - - queue_iterate(&vm_object_cached_list, o, vm_object_t, cached_list) { - if (object == o) { - return TRUE; - } - } - return FALSE; -} - -#if MACH_PAGEMAP -/* - * vm_external_print: [ debug ] - */ -void -vm_external_print( - vm_external_map_t emap, - vm_size_t size) -{ - if (emap == VM_EXTERNAL_NULL) { - printf("0 "); - } else { - vm_size_t existence_size = stob(size); - printf("{ size=%d, map=[", existence_size); - if (existence_size > 0) { - print_bitstring(emap[0]); - } - if (existence_size > 1) { - print_bitstring(emap[1]); - } - if (existence_size > 2) { - printf("..."); - print_bitstring(emap[existence_size-1]); - } - printf("] }\n"); - } - return; -} -#endif /* MACH_PAGEMAP */ - -int -vm_follow_object( - vm_object_t object) -{ - int count = 0; - int orig_db_indent = db_indent; - - while (TRUE) { - if (object == VM_OBJECT_NULL) { - db_indent = orig_db_indent; - return count; - } - - count += 1; - - iprintf("object 0x%x", object); - printf(", shadow=0x%x", object->shadow); - printf(", copy=0x%x", object->copy); - printf(", pager=0x%x", object->pager); - printf(", ref=%d\n", object->ref_count); - - db_indent += 2; - object = object->shadow; - } - -} - -/* - * vm_object_print: [ debug ] - */ -void -vm_object_print( - db_addr_t db_addr, - __unused boolean_t have_addr, - __unused int arg_count, - __unused char *modif) -{ - vm_object_t object; - register vm_page_t p; - const char *s; - - register int count; - - object = (vm_object_t) (long) db_addr; - if (object == VM_OBJECT_NULL) - return; - - iprintf("object 0x%x\n", object); - - db_indent += 2; - - iprintf("size=0x%x", object->size); - printf(", cluster=0x%x", object->cluster_size); - printf(", memq_hint=%p", object->memq_hint); - printf(", ref_count=%d\n", object->ref_count); - iprintf(""); -#if TASK_SWAPPER - printf("res_count=%d, ", object->res_count); -#endif /* TASK_SWAPPER */ - printf("resident_page_count=%d\n", object->resident_page_count); - - iprintf("shadow=0x%x", object->shadow); - if (object->shadow) { - register int i = 0; - vm_object_t shadow = object; - while((shadow = shadow->shadow)) - i++; - printf(" (depth %d)", i); - } - printf(", copy=0x%x", object->copy); - printf(", shadow_offset=0x%x", object->shadow_offset); - printf(", last_alloc=0x%x\n", object->last_alloc); - - iprintf("pager=0x%x", object->pager); - printf(", paging_offset=0x%x", object->paging_offset); - printf(", pager_control=0x%x\n", object->pager_control); - - iprintf("copy_strategy=%d[", object->copy_strategy); - switch (object->copy_strategy) { - case MEMORY_OBJECT_COPY_NONE: - printf("copy_none"); - break; - - case MEMORY_OBJECT_COPY_CALL: - printf("copy_call"); - break; - - case MEMORY_OBJECT_COPY_DELAY: - printf("copy_delay"); - break; - - case MEMORY_OBJECT_COPY_SYMMETRIC: - printf("copy_symmetric"); - break; - - case MEMORY_OBJECT_COPY_INVALID: - printf("copy_invalid"); - break; - - default: - printf("?"); - } - printf("]"); - printf(", absent_count=%d\n", object->absent_count); - - iprintf("all_wanted=0x%x<", object->all_wanted); - s = ""; - if (vm_object_wanted(object, VM_OBJECT_EVENT_INITIALIZED)) { - printf("%sinit", s); - s = ","; - } - if (vm_object_wanted(object, VM_OBJECT_EVENT_PAGER_READY)) { - printf("%sready", s); - s = ","; - } - if (vm_object_wanted(object, VM_OBJECT_EVENT_PAGING_IN_PROGRESS)) { - printf("%spaging", s); - s = ","; - } - if (vm_object_wanted(object, VM_OBJECT_EVENT_ABSENT_COUNT)) { - printf("%sabsent", s); - s 
= ","; - } - if (vm_object_wanted(object, VM_OBJECT_EVENT_LOCK_IN_PROGRESS)) { - printf("%slock", s); - s = ","; - } - if (vm_object_wanted(object, VM_OBJECT_EVENT_UNCACHING)) { - printf("%suncaching", s); - s = ","; - } - if (vm_object_wanted(object, VM_OBJECT_EVENT_COPY_CALL)) { - printf("%scopy_call", s); - s = ","; - } - if (vm_object_wanted(object, VM_OBJECT_EVENT_CACHING)) { - printf("%scaching", s); - s = ","; - } - printf(">"); - printf(", paging_in_progress=%d\n", object->paging_in_progress); - - iprintf("%screated, %sinit, %sready, %spersist, %strusted, %spageout, %s, %s\n", - (object->pager_created ? "" : "!"), - (object->pager_initialized ? "" : "!"), - (object->pager_ready ? "" : "!"), - (object->can_persist ? "" : "!"), - (object->pager_trusted ? "" : "!"), - (object->pageout ? "" : "!"), - (object->internal ? "internal" : "external"), - (object->temporary ? "temporary" : "permanent")); - iprintf("%salive, %spurgable, %spurgable_volatile, %spurgable_empty, %sshadowed, %scached, %sprivate\n", - (object->alive ? "" : "!"), - ((object->purgable != VM_OBJECT_NONPURGABLE) ? "" : "!"), - ((object->purgable == VM_OBJECT_PURGABLE_VOLATILE) ? "" : "!"), - ((object->purgable == VM_OBJECT_PURGABLE_EMPTY) ? "" : "!"), - (object->shadowed ? "" : "!"), - (vm_object_cached(object) ? "" : "!"), - (object->private ? "" : "!")); - iprintf("%sadvisory_pageout, %ssilent_overwrite\n", - (object->advisory_pageout ? "" : "!"), - (object->silent_overwrite ? "" : "!")); - -#if MACH_PAGEMAP - iprintf("existence_map="); - vm_external_print(object->existence_map, object->size); -#endif /* MACH_PAGEMAP */ -#if MACH_ASSERT - iprintf("paging_object=0x%x\n", object->paging_object); -#endif /* MACH_ASSERT */ - - if (vm_object_print_pages) { - count = 0; - p = (vm_page_t) queue_first(&object->memq); - while (!queue_end(&object->memq, (queue_entry_t) p)) { - if (count == 0) { - iprintf("memory:="); - } else if (count == 2) { - printf("\n"); - iprintf(" ..."); - count = 0; - } else { - printf(","); - } - count++; - - printf("(off=0x%llX,page=%p)", p->offset, p); - p = (vm_page_t) queue_next(&p->listq); - } - if (count != 0) { - printf("\n"); - } - } - db_indent -= 2; -} - - -/* - * vm_object_find [ debug ] - * - * Find all tasks which reference the given vm_object. - */ - -boolean_t vm_object_find(vm_object_t object); -boolean_t vm_object_print_verbose = FALSE; - -boolean_t -vm_object_find( - vm_object_t object) -{ - task_t task; - vm_map_t map; - vm_map_entry_t entry; - processor_set_t pset = &default_pset; - boolean_t found = FALSE; - - queue_iterate(&pset->tasks, task, task_t, pset_tasks) { - map = task->map; - for (entry = vm_map_first_entry(map); - entry && entry != vm_map_to_entry(map); - entry = entry->vme_next) { - - vm_object_t obj; - - /* - * For the time being skip submaps, - * only the kernel can have submaps, - * and unless we are interested in - * kernel objects, we can simply skip - * submaps. See sb/dejan/nmk18b7/src/mach_kernel/vm - * for a full solution. 
- */ - if (entry->is_sub_map) - continue; - if (entry) - obj = entry->object.vm_object; - else - continue; - - while (obj != VM_OBJECT_NULL) { - if (obj == object) { - if (!found) { - printf("TASK\t\tMAP\t\tENTRY\n"); - found = TRUE; - } - printf("0x%x\t0x%x\t0x%x\n", - task, map, entry); - } - obj = obj->shadow; - } - } - } - - return(found); + return TRUE; } -#endif /* MACH_KDB */ - kern_return_t vm_object_populate_with_private( - vm_object_t object, - vm_object_offset_t offset, - ppnum_t phys_page, - vm_size_t size) + vm_object_t object, + vm_object_offset_t offset, + ppnum_t phys_page, + vm_size_t size) { - ppnum_t base_page; - vm_object_offset_t base_offset; + ppnum_t base_page; + vm_object_offset_t base_offset; - if(!object->private) + if (!object->private) { return KERN_FAILURE; + } base_page = phys_page; vm_object_lock(object); - if(!object->phys_contiguous) { - vm_page_t m; - if((base_offset = trunc_page_64(offset)) != offset) { + + if (!object->phys_contiguous) { + vm_page_t m; + + if ((base_offset = trunc_page_64(offset)) != offset) { vm_object_unlock(object); return KERN_FAILURE; } base_offset += object->paging_offset; - while(size) { + + while (size) { m = vm_page_lookup(object, base_offset); - if(m != VM_PAGE_NULL) { - if(m->fictitious) { - vm_page_lock_queues(); - m->fictitious = FALSE; - m->private = TRUE; - m->phys_page = base_page; - if(!m->busy) { - m->busy = TRUE; + + if (m != VM_PAGE_NULL) { + if (m->vmp_fictitious) { + if (VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr) { + vm_page_lockspin_queues(); + m->vmp_private = TRUE; + vm_page_unlock_queues(); + + m->vmp_fictitious = FALSE; + VM_PAGE_SET_PHYS_PAGE(m, base_page); } - if(!m->absent) { - m->absent = TRUE; - object->absent_count++; + } else if (VM_PAGE_GET_PHYS_PAGE(m) != base_page) { + if (!m->vmp_private) { + /* + * we'd leak a real page... that can't be right + */ + panic("vm_object_populate_with_private - %p not private", m); } - m->list_req_pending = TRUE; - vm_page_unlock_queues(); - } else if (m->phys_page != base_page) { - /* pmap call to clear old mapping */ - pmap_disconnect(m->phys_page); - m->phys_page = base_page; + if (m->vmp_pmapped) { + /* + * pmap call to clear old mapping + */ + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); + } + VM_PAGE_SET_PHYS_PAGE(m, base_page); + } + } else { + while ((m = vm_page_grab_fictitious()) == VM_PAGE_NULL) { + vm_page_more_fictitious(); } /* - * ENCRYPTED SWAP: - * We're not pointing to the same - * physical page any longer and the - * contents of the new one are not - * supposed to be encrypted. - * XXX What happens to the original - * physical page. Is it lost ? 
+ * private normally requires lock_queues but since we + * are initializing the page, its not necessary here */ - m->encrypted = FALSE; + m->vmp_private = TRUE; + m->vmp_fictitious = FALSE; + VM_PAGE_SET_PHYS_PAGE(m, base_page); + m->vmp_unusual = TRUE; + m->vmp_busy = FALSE; - } else { - while ((m = vm_page_grab_fictitious()) - == VM_PAGE_NULL) - vm_page_more_fictitious(); - vm_page_lock_queues(); - m->fictitious = FALSE; - m->private = TRUE; - m->phys_page = base_page; - m->list_req_pending = TRUE; - m->absent = TRUE; - m->unusual = TRUE; - object->absent_count++; - vm_page_unlock_queues(); - vm_page_insert(m, object, base_offset); + vm_page_insert(m, object, base_offset); } - base_page++; /* Go to the next physical page */ + base_page++; /* Go to the next physical page */ base_offset += PAGE_SIZE; size -= PAGE_SIZE; } } else { /* NOTE: we should check the original settings here */ /* if we have a size > zero a pmap call should be made */ - /* to disable the range */ + /* to disable the range */ /* pmap_? */ - + /* shadows on contiguous memory are not allowed */ /* we therefore can use the offset field */ - object->shadow_offset = (vm_object_offset_t)(phys_page << 12); - object->size = size; + object->vo_shadow_offset = (vm_object_offset_t)phys_page << PAGE_SHIFT; + object->vo_size = size; } vm_object_unlock(object); + return KERN_SUCCESS; } -/* - * memory_object_free_from_cache: - * - * Walk the vm_object cache list, removing and freeing vm_objects - * which are backed by the pager identified by the caller, (pager_id). - * Remove up to "count" objects, if there are that may available - * in the cache. - * - * Walk the list at most once, return the number of vm_objects - * actually freed. - */ - -__private_extern__ kern_return_t -memory_object_free_from_cache( - __unused host_t host, - int *pager_id, - int *count) -{ - - int object_released = 0; - - register vm_object_t object = VM_OBJECT_NULL; - vm_object_t shadow; - -/* - if(host == HOST_NULL) - return(KERN_INVALID_ARGUMENT); -*/ - - try_again: - vm_object_cache_lock(); - - queue_iterate(&vm_object_cached_list, object, - vm_object_t, cached_list) { - if (object->pager && (pager_id == object->pager->pager)) { - vm_object_lock(object); - queue_remove(&vm_object_cached_list, object, - vm_object_t, cached_list); - vm_object_cached_count--; - - /* - * Since this object is in the cache, we know - * that it is initialized and has only a pager's - * (implicit) reference. Take a reference to avoid - * recursive deallocations. - */ - - assert(object->pager_initialized); - assert(object->ref_count == 0); - object->ref_count++; - - /* - * Terminate the object. - * If the object had a shadow, we let - * vm_object_deallocate deallocate it. - * "pageout" objects have a shadow, but - * maintain a "paging reference" rather - * than a normal reference. - * (We are careful here to limit recursion.) 
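[Editor's note] vm_object_populate_with_private above insists that the offset be page aligned (the trunc_page_64 comparison) and then consumes one physical page number per page of object offset. A hedged sketch of that loop shape; toy_trunc_page, populate_range and the install callback are invented stand-ins for the real page lookup/insert work:

#include <stdbool.h>
#include <stdint.h>

#define TOY_PAGE_SIZE 4096ull

static inline uint64_t
toy_trunc_page(uint64_t off)
{
	return off & ~(TOY_PAGE_SIZE - 1);
}

/*
 * Shape of the non-contiguous branch: reject unaligned offsets, then hand
 * out one physical page number per page of object offset.  The install
 * callback stands in for "find or create a private page and set its
 * physical address".
 */
static bool
populate_range(uint64_t offset, uint32_t phys_page, uint64_t size,
    void (*install)(uint64_t off, uint32_t ppn))
{
	if (toy_trunc_page(offset) != offset) {
		return false;               /* KERN_FAILURE in the kernel code */
	}
	while (size >= TOY_PAGE_SIZE) {     /* the kernel expects a page multiple */
		install(offset, phys_page);
		phys_page++;                /* go to the next physical page */
		offset += TOY_PAGE_SIZE;
		size -= TOY_PAGE_SIZE;
	}
	return true;
}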
- */ - shadow = object->pageout?VM_OBJECT_NULL:object->shadow; - if ((vm_object_terminate(object) == KERN_SUCCESS) - && (shadow != VM_OBJECT_NULL)) { - vm_object_deallocate(shadow); - } - - if(object_released++ == *count) - return KERN_SUCCESS; - goto try_again; - } - } - vm_object_cache_unlock(); - *count = object_released; - return KERN_SUCCESS; -} - - kern_return_t memory_object_create_named( - memory_object_t pager, - memory_object_offset_t size, - memory_object_control_t *control) + memory_object_t pager, + memory_object_offset_t size, + memory_object_control_t *control) { - vm_object_t object; - vm_object_hash_entry_t entry; + vm_object_t object; *control = MEMORY_OBJECT_CONTROL_NULL; - if (pager == MEMORY_OBJECT_NULL) + if (pager == MEMORY_OBJECT_NULL) { return KERN_INVALID_ARGUMENT; + } - vm_object_cache_lock(); - entry = vm_object_hash_lookup(pager, FALSE); - if ((entry != VM_OBJECT_HASH_ENTRY_NULL) && - (entry->object != VM_OBJECT_NULL)) { - if (entry->object->named == TRUE) - panic("memory_object_create_named: caller already holds the right"); } - - vm_object_cache_unlock(); - if ((object = vm_object_enter(pager, size, FALSE, FALSE, TRUE)) - == VM_OBJECT_NULL) { - return(KERN_INVALID_OBJECT); + object = vm_object_memory_object_associate(pager, + VM_OBJECT_NULL, + size, + TRUE); + if (object == VM_OBJECT_NULL) { + return KERN_INVALID_OBJECT; } - + /* wait for object (if any) to be ready */ if (object != VM_OBJECT_NULL) { vm_object_lock(object); object->named = TRUE; while (!object->pager_ready) { vm_object_sleep(object, - VM_OBJECT_EVENT_PAGER_READY, - THREAD_UNINT); + VM_OBJECT_EVENT_PAGER_READY, + THREAD_UNINT); } *control = object->pager_control; vm_object_unlock(object); } - return (KERN_SUCCESS); + return KERN_SUCCESS; } @@ -4633,70 +5327,50 @@ memory_object_create_named( */ kern_return_t memory_object_recover_named( - memory_object_control_t control, - boolean_t wait_on_terminating) + memory_object_control_t control, + boolean_t wait_on_terminating) { - vm_object_t object; + vm_object_t object; - vm_object_cache_lock(); object = memory_object_control_to_vm_object(control); if (object == VM_OBJECT_NULL) { - vm_object_cache_unlock(); - return (KERN_INVALID_ARGUMENT); + return KERN_INVALID_ARGUMENT; } - restart: vm_object_lock(object); if (object->terminating && wait_on_terminating) { - vm_object_cache_unlock(); - vm_object_wait(object, - VM_OBJECT_EVENT_PAGING_IN_PROGRESS, - THREAD_UNINT); - vm_object_cache_lock(); + vm_object_wait(object, + VM_OBJECT_EVENT_PAGING_IN_PROGRESS, + THREAD_UNINT); goto restart; } if (!object->alive) { - vm_object_cache_unlock(); vm_object_unlock(object); return KERN_FAILURE; } if (object->named == TRUE) { - vm_object_cache_unlock(); vm_object_unlock(object); return KERN_SUCCESS; } - - if((object->ref_count == 0) && (!object->terminating)){ - queue_remove(&vm_object_cached_list, object, - vm_object_t, cached_list); - vm_object_cached_count--; - XPR(XPR_VM_OBJECT_CACHE, - "memory_object_recover_named: removing %X, head (%X, %X)\n", - (integer_t)object, - (integer_t)vm_object_cached_list.next, - (integer_t)vm_object_cached_list.prev, 0,0); - } - - vm_object_cache_unlock(); - object->named = TRUE; + vm_object_lock_assert_exclusive(object); object->ref_count++; vm_object_res_reference(object); while (!object->pager_ready) { vm_object_sleep(object, - VM_OBJECT_EVENT_PAGER_READY, - THREAD_UNINT); + VM_OBJECT_EVENT_PAGER_READY, + THREAD_UNINT); } vm_object_unlock(object); - return (KERN_SUCCESS); + return KERN_SUCCESS; } /* - * vm_object_release_name: + 
* vm_object_release_name: * * Enforces name semantic on memory_object reference count decrement * This routine should not be called unless the caller holds a name @@ -4707,33 +5381,26 @@ restart: * being the name. * If the decision is made to proceed the name field flag is set to * false and the reference count is decremented. If the RESPECT_CACHE - * flag is set and the reference count has gone to zero, the + * flag is set and the reference count has gone to zero, the * memory_object is checked to see if it is cacheable otherwise when * the reference count is zero, it is simply terminated. */ __private_extern__ kern_return_t vm_object_release_name( - vm_object_t object, - int flags) + vm_object_t object, + int flags) { - vm_object_t shadow; - boolean_t original_object = TRUE; + vm_object_t shadow; + boolean_t original_object = TRUE; while (object != VM_OBJECT_NULL) { - - /* - * The cache holds a reference (uncounted) to - * the object. We must locke it before removing - * the object. - * - */ - - vm_object_cache_lock(); vm_object_lock(object); + assert(object->alive); - if(original_object) + if (original_object) { assert(object->named); + } assert(object->ref_count > 0); /* @@ -4744,34 +5411,31 @@ vm_object_release_name( if (object->pager_created && !object->pager_initialized) { assert(!object->can_persist); vm_object_assert_wait(object, - VM_OBJECT_EVENT_INITIALIZED, - THREAD_UNINT); + VM_OBJECT_EVENT_INITIALIZED, + THREAD_UNINT); vm_object_unlock(object); - vm_object_cache_unlock(); thread_block(THREAD_CONTINUE_NULL); continue; } if (((object->ref_count > 1) - && (flags & MEMORY_OBJECT_TERMINATE_IDLE)) - || (object->terminating)) { + && (flags & MEMORY_OBJECT_TERMINATE_IDLE)) + || (object->terminating)) { vm_object_unlock(object); - vm_object_cache_unlock(); return KERN_FAILURE; } else { if (flags & MEMORY_OBJECT_RELEASE_NO_OP) { vm_object_unlock(object); - vm_object_cache_unlock(); return KERN_SUCCESS; } } - + if ((flags & MEMORY_OBJECT_RESPECT_CACHE) && - (object->ref_count == 1)) { - if(original_object) + (object->ref_count == 1)) { + if (original_object) { object->named = FALSE; + } vm_object_unlock(object); - vm_object_cache_unlock(); /* let vm_object_deallocate push this thing into */ /* the cache, if that it is where it is bound */ vm_object_deallocate(object); @@ -4779,9 +5443,10 @@ vm_object_release_name( } VM_OBJ_RES_DECR(object); shadow = object->pageout?VM_OBJECT_NULL:object->shadow; - if(object->ref_count == 1) { - if(vm_object_terminate(object) != KERN_SUCCESS) { - if(original_object) { + + if (object->ref_count == 1) { + if (vm_object_terminate(object) != KERN_SUCCESS) { + if (original_object) { return KERN_FAILURE; } else { return KERN_SUCCESS; @@ -4794,12 +5459,13 @@ vm_object_release_name( } return KERN_SUCCESS; } else { + vm_object_lock_assert_exclusive(object); object->ref_count--; assert(object->ref_count > 0); - if(original_object) + if (original_object) { object->named = FALSE; + } vm_object_unlock(object); - vm_object_cache_unlock(); return KERN_SUCCESS; } } @@ -4811,30 +5477,27 @@ vm_object_release_name( __private_extern__ kern_return_t vm_object_lock_request( - vm_object_t object, - vm_object_offset_t offset, - vm_object_size_t size, - memory_object_return_t should_return, - int flags, - vm_prot_t prot) + vm_object_t object, + vm_object_offset_t offset, + vm_object_size_t size, + memory_object_return_t should_return, + int flags, + vm_prot_t prot) { - __unused boolean_t should_flush; + __unused boolean_t should_flush; should_flush = flags & 
MEMORY_OBJECT_DATA_FLUSH; - XPR(XPR_MEMORY_OBJECT, - "vm_o_lock_request, obj 0x%X off 0x%X size 0x%X flags %X prot %X\n", - (integer_t)object, offset, size, - (((should_return&1)<<1)|should_flush), prot); - /* * Check for bogus arguments. */ - if (object == VM_OBJECT_NULL) - return (KERN_INVALID_ARGUMENT); + if (object == VM_OBJECT_NULL) { + return KERN_INVALID_ARGUMENT; + } - if ((prot & ~VM_PROT_ALL) != 0 && prot != VM_PROT_NO_CHANGE) - return (KERN_INVALID_ARGUMENT); + if ((prot & ~VM_PROT_ALL) != 0 && prot != VM_PROT_NO_CHANGE) { + return KERN_INVALID_ARGUMENT; + } size = round_page_64(size); @@ -4846,223 +5509,221 @@ vm_object_lock_request( vm_object_paging_begin(object); (void)vm_object_update(object, - offset, size, NULL, NULL, should_return, flags, prot); + offset, size, NULL, NULL, should_return, flags, prot); vm_object_paging_end(object); vm_object_unlock(object); - return (KERN_SUCCESS); + return KERN_SUCCESS; } /* - * Empty a purgable object by grabbing the physical pages assigned to it and + * Empty a purgeable object by grabbing the physical pages assigned to it and * putting them on the free queue without writing them to backing store, etc. * When the pages are next touched they will be demand zero-fill pages. We * skip pages which are busy, being paged in/out, wired, etc. We do _not_ * skip referenced/dirty pages, pages on the active queue, etc. We're more - * than happy to grab these since this is a purgable object. We mark the + * than happy to grab these since this is a purgeable object. We mark the * object as "empty" after reaping its pages. * - * On entry the object and page queues are locked, the object must be a - * purgable object with no delayed copies pending. + * On entry the object must be locked and it must be + * purgeable with no delayed copies pending. */ -unsigned int -vm_object_purge(vm_object_t object) +uint64_t +vm_object_purge(vm_object_t object, int flags) { - vm_page_t p, next; - unsigned int num_purged_pages; - vm_page_t local_freeq; - unsigned long local_freed; - int purge_loop_quota; -/* free pages as soon as we gather PURGE_BATCH_FREE_LIMIT pages to free */ -#define PURGE_BATCH_FREE_LIMIT 50 -/* release page queues lock every PURGE_LOOP_QUOTA iterations */ -#define PURGE_LOOP_QUOTA 100 - - num_purged_pages = 0; - if (object->purgable == VM_OBJECT_NONPURGABLE) - return num_purged_pages; - - object->purgable = VM_OBJECT_PURGABLE_EMPTY; + unsigned int object_page_count = 0, pgcount = 0; + uint64_t total_purged_pgcount = 0; + boolean_t skipped_object = FALSE; + + vm_object_lock_assert_exclusive(object); + + if (object->purgable == VM_PURGABLE_DENY) { + return 0; + } assert(object->copy == VM_OBJECT_NULL); assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE); - purge_loop_quota = PURGE_LOOP_QUOTA; - - local_freeq = VM_PAGE_NULL; - local_freed = 0; /* - * Go through the object's resident pages and try and discard them. + * We need to set the object's state to VM_PURGABLE_EMPTY *before* + * reaping its pages. We update vm_page_purgeable_count in bulk + * and we don't want vm_page_remove() to update it again for each + * page we reap later. + * + * For the purgeable ledgers, pages from VOLATILE and EMPTY objects + * are all accounted for in the "volatile" ledgers, so this does not + * make any difference. 
+ * If we transitioned directly from NONVOLATILE to EMPTY, + * vm_page_purgeable_count must have been updated when the object + * was dequeued from its volatile queue and the purgeable ledgers + * must have also been updated accordingly at that time (in + * vm_object_purgable_control()). */ - next = (vm_page_t)queue_first(&object->memq); - while (!queue_end(&object->memq, (queue_entry_t)next)) { - p = next; - next = (vm_page_t)queue_next(&next->listq); - - if (purge_loop_quota-- == 0) { - /* - * Avoid holding the page queues lock for too long. - * Let someone else take it for a while if needed. - * Keep holding the object's lock to guarantee that - * the object's page list doesn't change under us - * while we yield. - */ - if (local_freeq != VM_PAGE_NULL) { - /* - * Flush our queue of pages to free. - */ - vm_page_free_list(local_freeq); - local_freeq = VM_PAGE_NULL; - local_freed = 0; - } - vm_page_unlock_queues(); - mutex_pause(); - vm_page_lock_queues(); - - /* resume with the current page and a new quota */ - purge_loop_quota = PURGE_LOOP_QUOTA; - } - - - if (p->busy || p->cleaning || p->laundry || - p->list_req_pending) { - /* page is being acted upon, so don't mess with it */ - continue; + if (object->purgable == VM_PURGABLE_VOLATILE) { + unsigned int delta; + assert(object->resident_page_count >= + object->wired_page_count); + delta = (object->resident_page_count - + object->wired_page_count); + if (delta != 0) { + assert(vm_page_purgeable_count >= + delta); + OSAddAtomic(-delta, + (SInt32 *)&vm_page_purgeable_count); } - if (p->wire_count) { - /* don't discard a wired page */ - continue; + if (object->wired_page_count != 0) { + assert(vm_page_purgeable_wired_count >= + object->wired_page_count); + OSAddAtomic(-object->wired_page_count, + (SInt32 *)&vm_page_purgeable_wired_count); } + object->purgable = VM_PURGABLE_EMPTY; + } + assert(object->purgable == VM_PURGABLE_EMPTY); - if (p->tabled) { - /* clean up the object/offset table */ - vm_page_remove(p); - } - if (p->absent) { - /* update the object's count of absent pages */ - vm_object_absent_release(object); - } + object_page_count = object->resident_page_count; - /* we can discard this page */ + vm_object_reap_pages(object, REAP_PURGEABLE); - /* advertize that this page is in a transition state */ - p->busy = TRUE; + if (object->resident_page_count >= object_page_count) { + total_purged_pgcount = 0; + } else { + total_purged_pgcount = object_page_count - object->resident_page_count; + } - if (p->no_isync == TRUE) { - /* the page hasn't been mapped yet */ - /* (optimization to delay the i-cache sync) */ - } else { - /* unmap the page */ - int refmod_state; + if (object->pager != NULL) { + assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); - refmod_state = pmap_disconnect(p->phys_page); - if (refmod_state & VM_MEM_MODIFIED) { - p->dirty = TRUE; + if (object->activity_in_progress == 0 && + object->paging_in_progress == 0) { + /* + * Also reap any memory coming from this object + * in the VM compressor. + * + * There are no operations in progress on the VM object + * and no operation can start while we're holding the + * VM object lock, so it's safe to reap the compressed + * pages and update the page counts. 
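[Editor's note] The accounting above is deliberately done in bulk: the object is flipped to VM_PURGABLE_EMPTY first and the global purgeable counters are adjusted once by "resident - wired", so the per-page reaping path does not re-adjust them page by page. A small illustration of that pattern using C11 atomics in place of OSAddAtomic; the toy counters here are not the kernel's globals:

#include <assert.h>
#include <stdatomic.h>

/* stand-ins for vm_page_purgeable_count / vm_page_purgeable_wired_count */
static atomic_uint toy_purgeable_count;
static atomic_uint toy_purgeable_wired_count;

/*
 * Account for all of a volatile object's pages in one shot before reaping
 * them, instead of letting the per-page removal path decrement the
 * counters one page at a time.
 */
static void
account_purge(unsigned resident_pages, unsigned wired_pages)
{
	assert(resident_pages >= wired_pages);
	unsigned delta = resident_pages - wired_pages;

	if (delta != 0) {
		atomic_fetch_sub(&toy_purgeable_count, delta);
	}
	if (wired_pages != 0) {
		atomic_fetch_sub(&toy_purgeable_wired_count, wired_pages);
	}
}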
+ */ + pgcount = vm_compressor_pager_get_count(object->pager); + if (pgcount) { + pgcount = vm_compressor_pager_reap_pages(object->pager, flags); + vm_compressor_pager_count(object->pager, + -pgcount, + FALSE, /* shared */ + object); + vm_object_owner_compressed_update(object, + -pgcount); } + if (!(flags & C_DONT_BLOCK)) { + assert(vm_compressor_pager_get_count(object->pager) + == 0); + } + } else { + /* + * There's some kind of paging activity in progress + * for this object, which could result in a page + * being compressed or decompressed, possibly while + * the VM object is not locked, so it could race + * with us. + * + * We can't really synchronize this without possibly + * causing a deadlock when the compressor needs to + * allocate or free memory while compressing or + * decompressing a page from a purgeable object + * mapped in the kernel_map... + * + * So let's not attempt to purge the compressor + * pager if there's any kind of operation in + * progress on the VM object. + */ + skipped_object = TRUE; } + } - if (p->dirty || p->precious) { - /* we saved the cost of cleaning this page ! */ - num_purged_pages++; - vm_page_purged_count++; - } - - /* remove page from active or inactive queue... */ - VM_PAGE_QUEUES_REMOVE(p); + vm_object_lock_assert_exclusive(object); - /* ... and put it on our queue of pages to free */ - assert(!p->laundry); - assert(p->object != kernel_object); - assert(p->pageq.next == NULL && - p->pageq.prev == NULL); - p->pageq.next = (queue_entry_t) local_freeq; - local_freeq = p; - if (++local_freed >= PURGE_BATCH_FREE_LIMIT) { - /* flush our queue of pages to free */ - vm_page_free_list(local_freeq); - local_freeq = VM_PAGE_NULL; - local_freed = 0; - } - } + total_purged_pgcount += pgcount; - /* flush our local queue of pages to free one last time */ - if (local_freeq != VM_PAGE_NULL) { - vm_page_free_list(local_freeq); - local_freeq = VM_PAGE_NULL; - local_freed = 0; - } + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_VM, OBJECT_PURGE_ONE)), + VM_KERNEL_UNSLIDE_OR_PERM(object), /* purged object */ + object_page_count, + total_purged_pgcount, + skipped_object, + 0); - return num_purged_pages; + return total_purged_pgcount; } + /* - * vm_object_purgable_control() allows the caller to control and investigate the - * state of a purgable object. A purgable object is created via a call to - * vm_allocate() with VM_FLAGS_PURGABLE specified. A purgable object will - * never be coalesced with any other object -- even other purgable objects -- - * and will thus always remain a distinct object. A purgable object has + * vm_object_purgeable_control() allows the caller to control and investigate the + * state of a purgeable object. A purgeable object is created via a call to + * vm_allocate() with VM_FLAGS_PURGABLE specified. A purgeable object will + * never be coalesced with any other object -- even other purgeable objects -- + * and will thus always remain a distinct object. A purgeable object has * special semantics when its reference count is exactly 1. If its reference - * count is greater than 1, then a purgable object will behave like a normal + * count is greater than 1, then a purgeable object will behave like a normal * object and attempts to use this interface will result in an error return * of KERN_INVALID_ARGUMENT. 
* - * A purgable object may be put into a "volatile" state which will make the + * A purgeable object may be put into a "volatile" state which will make the * object's pages elligable for being reclaimed without paging to backing * store if the system runs low on memory. If the pages in a volatile - * purgable object are reclaimed, the purgable object is said to have been - * "emptied." When a purgable object is emptied the system will reclaim as + * purgeable object are reclaimed, the purgeable object is said to have been + * "emptied." When a purgeable object is emptied the system will reclaim as * many pages from the object as it can in a convenient manner (pages already * en route to backing store or busy for other reasons are left as is). When - * a purgable object is made volatile, its pages will generally be reclaimed + * a purgeable object is made volatile, its pages will generally be reclaimed * before other pages in the application's working set. This semantic is * generally used by applications which can recreate the data in the object * faster than it can be paged in. One such example might be media assets * which can be reread from a much faster RAID volume. * - * A purgable object may be designated as "non-volatile" which means it will + * A purgeable object may be designated as "non-volatile" which means it will * behave like all other objects in the system with pages being written to and * read from backing store as needed to satisfy system memory needs. If the * object was emptied before the object was made non-volatile, that fact will - * be returned as the old state of the purgable object (see + * be returned as the old state of the purgeable object (see * VM_PURGABLE_SET_STATE below). In this case, any pages of the object which * were reclaimed as part of emptying the object will be refaulted in as * zero-fill on demand. It is up to the application to note that an object * was emptied and recreate the objects contents if necessary. When a - * purgable object is made non-volatile, its pages will generally not be paged - * out to backing store in the immediate future. A purgable object may also + * purgeable object is made non-volatile, its pages will generally not be paged + * out to backing store in the immediate future. A purgeable object may also * be manually emptied. * * Finally, the current state (non-volatile, volatile, volatile & empty) of a - * volatile purgable object may be queried at any time. This information may + * volatile purgeable object may be queried at any time. This information may * be used as a control input to let the application know when the system is * experiencing memory pressure and is reclaiming memory. * - * The specified address may be any address within the purgable object. If + * The specified address may be any address within the purgeable object. If * the specified address does not represent any object in the target task's * virtual address space, then KERN_INVALID_ADDRESS will be returned. If the - * object containing the specified address is not a purgable object, then + * object containing the specified address is not a purgeable object, then * KERN_INVALID_ARGUMENT will be returned. Otherwise, KERN_SUCCESS will be * returned. * * The control parameter may be any one of VM_PURGABLE_SET_STATE or * VM_PURGABLE_GET_STATE. For VM_PURGABLE_SET_STATE, the in/out parameter - * state is used to set the new state of the purgable object and return its - * old state. 
For VM_PURGABLE_GET_STATE, the current state of the purgable + * state is used to set the new state of the purgeable object and return its + * old state. For VM_PURGABLE_GET_STATE, the current state of the purgeable * object is returned in the parameter state. * * The in/out parameter state may be one of VM_PURGABLE_NONVOLATILE, * VM_PURGABLE_VOLATILE or VM_PURGABLE_EMPTY. These, respectively, represent * the non-volatile, volatile and volatile/empty states described above. - * Setting the state of a purgable object to VM_PURGABLE_EMPTY will + * Setting the state of a purgeable object to VM_PURGABLE_EMPTY will * immediately reclaim as many pages in the object as can be conveniently * collected (some may have already been written to backing store or be * otherwise busy). * - * The process of making a purgable object non-volatile and determining its - * previous state is atomic. Thus, if a purgable object is made + * The process of making a purgeable object non-volatile and determining its + * previous state is atomic. Thus, if a purgeable object is made * VM_PURGABLE_NONVOLATILE and the old state is returned as - * VM_PURGABLE_VOLATILE, then the purgable object's previous contents are + * VM_PURGABLE_VOLATILE, then the purgeable object's previous contents are * completely intact and will remain so until the object is made volatile * again. If the old state is returned as VM_PURGABLE_EMPTY then the object * was reclaimed while it was in a volatile state and its previous contents @@ -5073,48 +5734,32 @@ vm_object_purge(vm_object_t object) */ kern_return_t vm_object_purgable_control( - vm_object_t object, - vm_purgable_t control, - int *state) + vm_object_t object, + vm_purgable_t control, + int *state) { - int old_state; - vm_page_t p; + int old_state; + int new_state; if (object == VM_OBJECT_NULL) { /* - * Object must already be present or it can't be purgable. + * Object must already be present or it can't be purgeable. */ return KERN_INVALID_ARGUMENT; } + vm_object_lock_assert_exclusive(object); + /* - * Get current state of the purgable object. + * Get current state of the purgeable object. 
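[Editor's note] The long comment above describes the userspace-visible contract. Assuming the standard Mach VM interfaces on macOS (mach_vm_allocate with VM_FLAGS_PURGABLE, mach_vm_purgable_control), a typical client would use it roughly as below; the program is an illustrative sketch, not part of this change:

#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <mach/vm_purgable.h>
#include <mach/vm_statistics.h>
#include <stdio.h>

int
main(void)
{
	mach_vm_address_t addr = 0;
	mach_vm_size_t size = 16 * 4096;
	kern_return_t kr;
	int state;

	/* Purgeable memory must be requested at allocation time. */
	kr = mach_vm_allocate(mach_task_self(), &addr, size,
	    VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE);
	if (kr != KERN_SUCCESS) {
		return 1;
	}

	/* ... fill the buffer with data that can be recreated (e.g. a decoded asset) ... */

	/* Mark it volatile: the kernel may now reclaim it under memory pressure. */
	state = VM_PURGABLE_VOLATILE;
	kr = mach_vm_purgable_control(mach_task_self(), addr,
	    VM_PURGABLE_SET_STATE, &state);
	if (kr != KERN_SUCCESS) {
		return 1;
	}

	/* Before touching the data again, make it non-volatile and check
	 * whether it was emptied while volatile. */
	state = VM_PURGABLE_NONVOLATILE;
	kr = mach_vm_purgable_control(mach_task_self(), addr,
	    VM_PURGABLE_SET_STATE, &state);
	if (kr != KERN_SUCCESS) {
		return 1;
	}

	if (state == VM_PURGABLE_EMPTY) {
		printf("contents were reclaimed; regenerate them\n");
	} else {
		printf("contents survived intact\n");
	}

	mach_vm_deallocate(mach_task_self(), addr, size);
	return 0;
}

The check on the way back to non-volatile is the atomicity guarantee described above: an old state of VM_PURGABLE_VOLATILE means the contents are intact, VM_PURGABLE_EMPTY means they were reclaimed and must be regenerated.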
*/ - switch (object->purgable) { - case VM_OBJECT_NONPURGABLE: + old_state = object->purgable; + if (old_state == VM_PURGABLE_DENY) { return KERN_INVALID_ARGUMENT; - - case VM_OBJECT_PURGABLE_NONVOLATILE: - old_state = VM_PURGABLE_NONVOLATILE; - break; - - case VM_OBJECT_PURGABLE_VOLATILE: - old_state = VM_PURGABLE_VOLATILE; - break; - - case VM_OBJECT_PURGABLE_EMPTY: - old_state = VM_PURGABLE_EMPTY; - break; - - default: - old_state = VM_PURGABLE_NONVOLATILE; - panic("Bad state (%d) for purgable object!\n", - object->purgable); - /*NOTREACHED*/ } - /* purgable cant have delayed copies - now or in the future */ - assert(object->copy == VM_OBJECT_NULL); + /* purgeable cant have delayed copies - now or in the future */ + assert(object->copy == VM_OBJECT_NULL); assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE); /* @@ -5125,95 +5770,380 @@ vm_object_purgable_control( return KERN_SUCCESS; } - switch (*state) { - case VM_PURGABLE_NONVOLATILE: - vm_page_lock_queues(); - if (object->purgable != VM_OBJECT_PURGABLE_NONVOLATILE) { - assert(vm_page_purgeable_count >= - object->resident_page_count); - vm_page_purgeable_count -= object->resident_page_count; - } + if (control == VM_PURGABLE_SET_STATE && + object->purgeable_only_by_kernel) { + return KERN_PROTECTION_FAILURE; + } + + if (control != VM_PURGABLE_SET_STATE && + control != VM_PURGABLE_SET_STATE_FROM_KERNEL) { + return KERN_INVALID_ARGUMENT; + } + + if ((*state) & VM_PURGABLE_DEBUG_EMPTY) { + object->volatile_empty = TRUE; + } + if ((*state) & VM_PURGABLE_DEBUG_FAULT) { + object->volatile_fault = TRUE; + } - object->purgable = VM_OBJECT_PURGABLE_NONVOLATILE; + new_state = *state & VM_PURGABLE_STATE_MASK; + if (new_state == VM_PURGABLE_VOLATILE) { + if (old_state == VM_PURGABLE_EMPTY) { + /* what's been emptied must stay empty */ + new_state = VM_PURGABLE_EMPTY; + } + if (object->volatile_empty) { + /* debugging mode: go straight to empty */ + new_state = VM_PURGABLE_EMPTY; + } + } + switch (new_state) { + case VM_PURGABLE_DENY: /* - * If the object wasn't emptied, then mark all pages of the - * object as referenced in order to give them a complete turn - * of the virtual memory "clock" before becoming candidates - * for paging out (if the system is suffering from memory - * pressure). We don't really need to set the pmap reference - * bits (which would be expensive) since the software copies - * are believed if they're set to true ... + * Attempting to convert purgeable memory to non-purgeable: + * not allowed. 
*/ - if (old_state != VM_PURGABLE_EMPTY) { - for (p = (vm_page_t)queue_first(&object->memq); - !queue_end(&object->memq, (queue_entry_t)p); - p = (vm_page_t)queue_next(&p->listq)) - p->reference = TRUE; - } + return KERN_INVALID_ARGUMENT; + case VM_PURGABLE_NONVOLATILE: + object->purgable = new_state; - vm_page_unlock_queues(); + if (old_state == VM_PURGABLE_VOLATILE) { + unsigned int delta; + + assert(object->resident_page_count >= + object->wired_page_count); + delta = (object->resident_page_count - + object->wired_page_count); + + assert(vm_page_purgeable_count >= delta); + + if (delta != 0) { + OSAddAtomic(-delta, + (SInt32 *)&vm_page_purgeable_count); + } + if (object->wired_page_count != 0) { + assert(vm_page_purgeable_wired_count >= + object->wired_page_count); + OSAddAtomic(-object->wired_page_count, + (SInt32 *)&vm_page_purgeable_wired_count); + } + + vm_page_lock_queues(); + + /* object should be on a queue */ + assert(object->objq.next != NULL && + object->objq.prev != NULL); + purgeable_q_t queue; + + /* + * Move object from its volatile queue to the + * non-volatile queue... + */ + queue = vm_purgeable_object_remove(object); + assert(queue); + + if (object->purgeable_when_ripe) { + vm_purgeable_token_delete_last(queue); + } + assert(queue->debug_count_objects >= 0); + + vm_page_unlock_queues(); + } + if (old_state == VM_PURGABLE_VOLATILE || + old_state == VM_PURGABLE_EMPTY) { + /* + * Transfer the object's pages from the volatile to + * non-volatile ledgers. + */ + vm_purgeable_accounting(object, VM_PURGABLE_VOLATILE); + } break; case VM_PURGABLE_VOLATILE: - vm_page_lock_queues(); + if (object->volatile_fault) { + vm_page_t p; + int refmod; + + vm_page_queue_iterate(&object->memq, p, vmp_listq) { + if (p->vmp_busy || + VM_PAGE_WIRED(p) || + p->vmp_fictitious) { + continue; + } + refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p)); + if ((refmod & VM_MEM_MODIFIED) && + !p->vmp_dirty) { + SET_PAGE_DIRTY(p, FALSE); + } + } + } + + assert(old_state != VM_PURGABLE_EMPTY); - if (object->purgable != VM_OBJECT_PURGABLE_VOLATILE && - object->purgable != VM_OBJECT_PURGABLE_EMPTY) { - vm_page_purgeable_count += object->resident_page_count; + purgeable_q_t queue; + + /* find the correct queue */ + if ((*state & VM_PURGABLE_ORDERING_MASK) == VM_PURGABLE_ORDERING_OBSOLETE) { + queue = &purgeable_queues[PURGEABLE_Q_TYPE_OBSOLETE]; + } else { + if ((*state & VM_PURGABLE_BEHAVIOR_MASK) == VM_PURGABLE_BEHAVIOR_FIFO) { + queue = &purgeable_queues[PURGEABLE_Q_TYPE_FIFO]; + } else { + queue = &purgeable_queues[PURGEABLE_Q_TYPE_LIFO]; + } } - object->purgable = VM_OBJECT_PURGABLE_VOLATILE; + if (old_state == VM_PURGABLE_NONVOLATILE || + old_state == VM_PURGABLE_EMPTY) { + unsigned int delta; - /* - * We want the newly volatile purgable object to be a - * candidate for the pageout scan before other pages in the - * application if the system is suffering from memory - * pressure. To do this, we move a page of the object from - * the active queue onto the inactive queue in order to - * promote the object for early reclaim. We only need to move - * a single page since the pageout scan will reap the entire - * purgable object if it finds a single page in a volatile - * state. Obviously we don't do this if there are no pages - * associated with the object or we find a page of the object - * already on the inactive queue. 
- */ - for (p = (vm_page_t)queue_first(&object->memq); - !queue_end(&object->memq, (queue_entry_t)p); - p = (vm_page_t)queue_next(&p->listq)) { - if (p->inactive) { - /* already a page on the inactive queue */ - break; + if ((*state & VM_PURGABLE_NO_AGING_MASK) == + VM_PURGABLE_NO_AGING) { + object->purgeable_when_ripe = FALSE; + } else { + object->purgeable_when_ripe = TRUE; } - if (p->active && !p->busy) { - /* found one we can move */ - vm_page_deactivate(p); - break; + + if (object->purgeable_when_ripe) { + kern_return_t result; + + /* try to add token... this can fail */ + vm_page_lock_queues(); + + result = vm_purgeable_token_add(queue); + if (result != KERN_SUCCESS) { + vm_page_unlock_queues(); + return result; + } + vm_page_unlock_queues(); + } + + assert(object->resident_page_count >= + object->wired_page_count); + delta = (object->resident_page_count - + object->wired_page_count); + + if (delta != 0) { + OSAddAtomic(delta, + &vm_page_purgeable_count); + } + if (object->wired_page_count != 0) { + OSAddAtomic(object->wired_page_count, + &vm_page_purgeable_wired_count); + } + + object->purgable = new_state; + + /* object should be on "non-volatile" queue */ + assert(object->objq.next != NULL); + assert(object->objq.prev != NULL); + } else if (old_state == VM_PURGABLE_VOLATILE) { + purgeable_q_t old_queue; + boolean_t purgeable_when_ripe; + + /* + * if reassigning priorities / purgeable groups, we don't change the + * token queue. So moving priorities will not make pages stay around longer. + * Reasoning is that the algorithm gives most priority to the most important + * object. If a new token is added, the most important object' priority is boosted. + * This biases the system already for purgeable queues that move a lot. + * It doesn't seem more biasing is neccessary in this case, where no new object is added. + */ + assert(object->objq.next != NULL && object->objq.prev != NULL); /* object should be on a queue */ + + old_queue = vm_purgeable_object_remove(object); + assert(old_queue); + + if ((*state & VM_PURGABLE_NO_AGING_MASK) == + VM_PURGABLE_NO_AGING) { + purgeable_when_ripe = FALSE; + } else { + purgeable_when_ripe = TRUE; + } + + if (old_queue != queue || + (purgeable_when_ripe != + object->purgeable_when_ripe)) { + kern_return_t result; + + /* Changing queue. Have to move token. 
*/ + vm_page_lock_queues(); + if (object->purgeable_when_ripe) { + vm_purgeable_token_delete_last(old_queue); + } + object->purgeable_when_ripe = purgeable_when_ripe; + if (object->purgeable_when_ripe) { + result = vm_purgeable_token_add(queue); + assert(result == KERN_SUCCESS); /* this should never fail since we just freed a token */ + } + vm_page_unlock_queues(); } } - vm_page_unlock_queues(); + ; + vm_purgeable_object_add(object, queue, (*state & VM_VOLATILE_GROUP_MASK) >> VM_VOLATILE_GROUP_SHIFT ); + if (old_state == VM_PURGABLE_NONVOLATILE) { + vm_purgeable_accounting(object, + VM_PURGABLE_NONVOLATILE); + } + + assert(queue->debug_count_objects >= 0); break; case VM_PURGABLE_EMPTY: - vm_page_lock_queues(); - if (object->purgable != VM_OBJECT_PURGABLE_VOLATILE && - object->purgable != VM_OBJECT_PURGABLE_EMPTY) { - vm_page_purgeable_count += object->resident_page_count; + if (object->volatile_fault) { + vm_page_t p; + int refmod; + + vm_page_queue_iterate(&object->memq, p, vmp_listq) { + if (p->vmp_busy || + VM_PAGE_WIRED(p) || + p->vmp_fictitious) { + continue; + } + refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p)); + if ((refmod & VM_MEM_MODIFIED) && + !p->vmp_dirty) { + SET_PAGE_DIRTY(p, FALSE); + } + } + } + + if (old_state == VM_PURGABLE_VOLATILE) { + purgeable_q_t old_queue; + + /* object should be on a queue */ + assert(object->objq.next != NULL && + object->objq.prev != NULL); + + old_queue = vm_purgeable_object_remove(object); + assert(old_queue); + if (object->purgeable_when_ripe) { + vm_page_lock_queues(); + vm_purgeable_token_delete_first(old_queue); + vm_page_unlock_queues(); + } + } + + if (old_state == VM_PURGABLE_NONVOLATILE) { + /* + * This object's pages were previously accounted as + * "non-volatile" and now need to be accounted as + * "volatile". + */ + vm_purgeable_accounting(object, + VM_PURGABLE_NONVOLATILE); + /* + * Set to VM_PURGABLE_EMPTY because the pages are no + * longer accounted in the "non-volatile" ledger + * and are also not accounted for in + * "vm_page_purgeable_count". + */ + object->purgable = VM_PURGABLE_EMPTY; } - (void) vm_object_purge(object); - vm_page_unlock_queues(); - break; + (void) vm_object_purge(object, 0); + assert(object->purgable == VM_PURGABLE_EMPTY); + + break; } + *state = old_state; + vm_object_lock_assert_exclusive(object); + return KERN_SUCCESS; } -#if TASK_SWAPPER +kern_return_t +vm_object_get_page_counts( + vm_object_t object, + vm_object_offset_t offset, + vm_object_size_t size, + unsigned int *resident_page_count, + unsigned int *dirty_page_count) +{ + kern_return_t kr = KERN_SUCCESS; + boolean_t count_dirty_pages = FALSE; + vm_page_t p = VM_PAGE_NULL; + unsigned int local_resident_count = 0; + unsigned int local_dirty_count = 0; + vm_object_offset_t cur_offset = 0; + vm_object_offset_t end_offset = 0; + + if (object == VM_OBJECT_NULL) { + return KERN_INVALID_ARGUMENT; + } + + + cur_offset = offset; + + end_offset = offset + size; + + vm_object_lock_assert_exclusive(object); + + if (dirty_page_count != NULL) { + count_dirty_pages = TRUE; + } + + if (resident_page_count != NULL && count_dirty_pages == FALSE) { + /* + * Fast path when: + * - we only want the resident page count, and, + * - the entire object is exactly covered by the request. 
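[Editor's note] vm_object_get_page_counts, introduced here, picks one of three ways to count: a fast path when only the resident count is wanted and the request covers the whole object, a walk of the resident queue when the object holds fewer resident pages than the request spans, and one lookup per offset otherwise. A sketch of just that selection with invented names; the counting loops themselves follow in the hunk below:

#include <stdbool.h>
#include <stdint.h>

#define TOY_PAGE_SHIFT 12

enum count_strategy {
	COUNT_FAST_PATH,        /* whole object, resident count only */
	COUNT_WALK_RESIDENT,    /* sparse object: walk the resident queue */
	COUNT_LOOKUP_OFFSETS    /* dense object: probe each offset in range */
};

/*
 * Mirrors the three cases in vm_object_get_page_counts; all parameters
 * are plain numbers here rather than kernel structures.
 */
static enum count_strategy
pick_count_strategy(uint64_t offset, uint64_t size, uint64_t object_size,
    unsigned resident_page_count, bool want_dirty)
{
	if (!want_dirty && offset == 0 && size == object_size) {
		return COUNT_FAST_PATH;
	}
	if (resident_page_count <= (size >> TOY_PAGE_SHIFT)) {
		return COUNT_WALK_RESIDENT;
	}
	return COUNT_LOOKUP_OFFSETS;
}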
+ */ + if (offset == 0 && (object->vo_size == size)) { + *resident_page_count = object->resident_page_count; + goto out; + } + } + + if (object->resident_page_count <= (size >> PAGE_SHIFT)) { + vm_page_queue_iterate(&object->memq, p, vmp_listq) { + if (p->vmp_offset >= cur_offset && p->vmp_offset < end_offset) { + local_resident_count++; + + if (count_dirty_pages) { + if (p->vmp_dirty || (p->vmp_wpmapped && pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) { + local_dirty_count++; + } + } + } + } + } else { + for (cur_offset = offset; cur_offset < end_offset; cur_offset += PAGE_SIZE_64) { + p = vm_page_lookup(object, cur_offset); + + if (p != VM_PAGE_NULL) { + local_resident_count++; + + if (count_dirty_pages) { + if (p->vmp_dirty || (p->vmp_wpmapped && pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) { + local_dirty_count++; + } + } + } + } + } + + if (resident_page_count != NULL) { + *resident_page_count = local_resident_count; + } + + if (dirty_page_count != NULL) { + *dirty_page_count = local_dirty_count; + } + +out: + return kr; +} + + +#if TASK_SWAPPER /* * vm_object_res_deallocate * @@ -5229,7 +6159,7 @@ vm_object_purgable_control( __private_extern__ void vm_object_res_deallocate( - vm_object_t object) + vm_object_t object) { vm_object_t orig_object = object; /* @@ -5238,22 +6168,25 @@ vm_object_res_deallocate( * unlocked. */ assert(object->res_count > 0); - while (--object->res_count == 0) { + while (--object->res_count == 0) { assert(object->ref_count >= object->res_count); vm_object_deactivate_all_pages(object); /* iterate on shadow, if present */ if (object->shadow != VM_OBJECT_NULL) { vm_object_t tmp_object = object->shadow; vm_object_lock(tmp_object); - if (object != orig_object) + if (object != orig_object) { vm_object_unlock(object); + } object = tmp_object; assert(object->res_count > 0); - } else + } else { break; + } } - if (object != orig_object) + if (object != orig_object) { vm_object_unlock(object); + } } /* @@ -5272,28 +6205,30 @@ vm_object_res_deallocate( __private_extern__ void vm_object_res_reference( - vm_object_t object) + vm_object_t object) { vm_object_t orig_object = object; - /* + /* * Object is locked, so this can be called directly * from vm_object_reference. This lock is never released. */ - while ((++object->res_count == 1) && - (object->shadow != VM_OBJECT_NULL)) { + while ((++object->res_count == 1) && + (object->shadow != VM_OBJECT_NULL)) { vm_object_t tmp_object = object->shadow; assert(object->ref_count >= object->res_count); vm_object_lock(tmp_object); - if (object != orig_object) + if (object != orig_object) { vm_object_unlock(object); + } object = tmp_object; } - if (object != orig_object) + if (object != orig_object) { vm_object_unlock(object); + } assert(orig_object->ref_count >= orig_object->res_count); } -#endif /* TASK_SWAPPER */ +#endif /* TASK_SWAPPER */ /* * vm_object_reference: @@ -5305,10 +6240,11 @@ vm_object_res_reference( #endif __private_extern__ void vm_object_reference( - register vm_object_t object) + vm_object_t object) { - if (object == VM_OBJECT_NULL) + if (object == VM_OBJECT_NULL) { return; + } vm_object_lock(object); assert(object->ref_count > 0); @@ -5316,29 +6252,6 @@ vm_object_reference( vm_object_unlock(object); } -#ifdef MACH_BSD -/* - * Scale the vm_object_cache - * This is required to make sure that the vm_object_cache is big - * enough to effectively cache the mapped file. - * This is really important with UBC as all the regular file vnodes - * have memory object associated with them. 
Havving this cache too - * small results in rapid reclaim of vnodes and hurts performance a LOT! - * - * This is also needed as number of vnodes can be dynamically scaled. - */ -kern_return_t -adjust_vm_object_cache( - __unused vm_size_t oval, - vm_size_t nval) -{ - vm_object_cached_max = nval; - vm_object_cache_trim(FALSE); - return (KERN_SUCCESS); -} -#endif /* MACH_BSD */ - - /* * vm_object_transpose * @@ -5349,22 +6262,21 @@ adjust_vm_object_cache( * * The VM objects must not be locked by caller. */ +unsigned int vm_object_transpose_count = 0; kern_return_t vm_object_transpose( - vm_object_t object1, - vm_object_t object2, - vm_object_size_t transpose_size) + vm_object_t object1, + vm_object_t object2, + vm_object_size_t transpose_size) { - vm_object_t tmp_object; - kern_return_t retval; - boolean_t object1_locked, object2_locked; - boolean_t object1_paging, object2_paging; - vm_page_t page; - vm_object_offset_t page_offset; + vm_object_t tmp_object; + kern_return_t retval; + boolean_t object1_locked, object2_locked; + vm_page_t page; + vm_object_offset_t page_offset; tmp_object = VM_OBJECT_NULL; object1_locked = FALSE; object2_locked = FALSE; - object1_paging = FALSE; object2_paging = FALSE; if (object1 == object2 || object1 == VM_OBJECT_NULL || @@ -5377,10 +6289,34 @@ vm_object_transpose( goto done; } + /* + * Since we need to lock both objects at the same time, + * make sure we always lock them in the same order to + * avoid deadlocks. + */ + if (object1 > object2) { + tmp_object = object1; + object1 = object2; + object2 = tmp_object; + } + + /* + * Allocate a temporary VM object to hold object1's contents + * while we copy object2 to object1. + */ + tmp_object = vm_object_allocate(transpose_size); + vm_object_lock(tmp_object); + tmp_object->can_persist = FALSE; + + + /* + * Grab control of the 1st VM object. + */ vm_object_lock(object1); object1_locked = TRUE; - if (object1->copy || object1->shadow || object1->shadowed || - object1->purgable != VM_OBJECT_NONPURGABLE) { + if (!object1->alive || object1->terminating || + object1->copy || object1->shadow || object1->shadowed || + object1->purgable != VM_PURGABLE_DENY) { /* * We don't deal with copy or shadow objects (yet). */ @@ -5388,65 +6324,42 @@ vm_object_transpose( goto done; } /* - * Since we're about to mess with the object's backing store, - * mark it as "paging_in_progress". Note that this is not enough + * We're about to mess with the object's backing store and + * taking a "paging_in_progress" reference wouldn't be enough * to prevent any paging activity on this object, so the caller should * have "quiesced" the objects beforehand, via a UPL operation with * UPL_SET_IO_WIRE (to make sure all the pages are there and wired) * and UPL_BLOCK_ACCESS (to mark the pages "busy"). + * + * Wait for any paging operation to complete (but only paging, not + * other kind of activities not linked to the pager). After we're + * statisfied that there's no more paging in progress, we keep the + * object locked, to guarantee that no one tries to access its pager. */ - vm_object_paging_begin(object1); - object1_paging = TRUE; - vm_object_unlock(object1); - object1_locked = FALSE; + vm_object_paging_only_wait(object1, THREAD_UNINT); /* * Same as above for the 2nd object... 
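[Editor's note] The transpose path above avoids deadlock by always locking the two objects in the same order, swapping the pointers so the lower address is locked first. The same idea as a standalone sketch with pthread mutexes; lock_pair and toy_object are invented:

#include <pthread.h>
#include <stdint.h>

/* invented node; only the lock matters for this sketch */
struct toy_object {
	pthread_mutex_t lock;
};

/*
 * Every caller takes the two locks in one globally consistent (address)
 * order, so two threads operating on the same pair of objects can never
 * hold one lock each and wait on the other.
 */
static void
lock_pair(struct toy_object *a, struct toy_object *b)
{
	if ((uintptr_t)a > (uintptr_t)b) {
		struct toy_object *tmp = a;
		a = b;
		b = tmp;
	}
	pthread_mutex_lock(&a->lock);
	pthread_mutex_lock(&b->lock);
}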
*/ vm_object_lock(object2); object2_locked = TRUE; - if (object2->copy || object2->shadow || object2->shadowed || - object2->purgable != VM_OBJECT_NONPURGABLE) { + if (!object2->alive || object2->terminating || + object2->copy || object2->shadow || object2->shadowed || + object2->purgable != VM_PURGABLE_DENY) { retval = KERN_INVALID_VALUE; goto done; } - vm_object_paging_begin(object2); - object2_paging = TRUE; - vm_object_unlock(object2); - object2_locked = FALSE; - - /* - * Allocate a temporary VM object to hold object1's contents - * while we copy object2 to object1. - */ - tmp_object = vm_object_allocate(transpose_size); - vm_object_lock(tmp_object); - vm_object_paging_begin(tmp_object); - tmp_object->can_persist = FALSE; + vm_object_paging_only_wait(object2, THREAD_UNINT); - /* - * Since we need to lock both objects at the same time, - * make sure we always lock them in the same order to - * avoid deadlocks. - */ - if (object1 < object2) { - vm_object_lock(object1); - vm_object_lock(object2); - } else { - vm_object_lock(object2); - vm_object_lock(object1); - } - object1_locked = TRUE; - object2_locked = TRUE; - if (object1->size != object2->size || - object1->size != transpose_size) { + if (object1->vo_size != object2->vo_size || + object1->vo_size != transpose_size) { /* * If the 2 objects don't have the same size, we can't * exchange their backing stores or one would overflow. * If their size doesn't match the caller's * "transpose_size", we can't do it either because the - * transpose operation will affect the entire span of + * transpose operation will affect the entire span of * the objects. */ retval = KERN_INVALID_VALUE; @@ -5456,110 +6369,190 @@ vm_object_transpose( /* * Transpose the lists of resident pages. + * This also updates the resident_page_count and the memq_hint. */ - if (object1->phys_contiguous || queue_empty(&object1->memq)) { + if (object1->phys_contiguous || vm_page_queue_empty(&object1->memq)) { /* * No pages in object1, just transfer pages * from object2 to object1. No need to go through * an intermediate object. */ - while (!queue_empty(&object2->memq)) { - page = (vm_page_t) queue_first(&object2->memq); - vm_page_rename(page, object1, page->offset); + while (!vm_page_queue_empty(&object2->memq)) { + page = (vm_page_t) vm_page_queue_first(&object2->memq); + vm_page_rename(page, object1, page->vmp_offset); } - assert(queue_empty(&object2->memq)); - } else if (object2->phys_contiguous || queue_empty(&object2->memq)) { + assert(vm_page_queue_empty(&object2->memq)); + } else if (object2->phys_contiguous || vm_page_queue_empty(&object2->memq)) { /* * No pages in object2, just transfer pages * from object1 to object2. No need to go through * an intermediate object. 
*/ - while (!queue_empty(&object1->memq)) { - page = (vm_page_t) queue_first(&object1->memq); - vm_page_rename(page, object2, page->offset); + while (!vm_page_queue_empty(&object1->memq)) { + page = (vm_page_t) vm_page_queue_first(&object1->memq); + vm_page_rename(page, object2, page->vmp_offset); } - assert(queue_empty(&object1->memq)); + assert(vm_page_queue_empty(&object1->memq)); } else { /* transfer object1's pages to tmp_object */ - vm_page_lock_queues(); - while (!queue_empty(&object1->memq)) { - page = (vm_page_t) queue_first(&object1->memq); - page_offset = page->offset; - vm_page_remove(page); - page->offset = page_offset; - queue_enter(&tmp_object->memq, page, vm_page_t, listq); + while (!vm_page_queue_empty(&object1->memq)) { + page = (vm_page_t) vm_page_queue_first(&object1->memq); + page_offset = page->vmp_offset; + vm_page_remove(page, TRUE); + page->vmp_offset = page_offset; + vm_page_queue_enter(&tmp_object->memq, page, vmp_listq); } - vm_page_unlock_queues(); - assert(queue_empty(&object1->memq)); + assert(vm_page_queue_empty(&object1->memq)); /* transfer object2's pages to object1 */ - while (!queue_empty(&object2->memq)) { - page = (vm_page_t) queue_first(&object2->memq); - vm_page_rename(page, object1, page->offset); + while (!vm_page_queue_empty(&object2->memq)) { + page = (vm_page_t) vm_page_queue_first(&object2->memq); + vm_page_rename(page, object1, page->vmp_offset); } - assert(queue_empty(&object2->memq)); - /* transfer tmp_object's pages to object1 */ - while (!queue_empty(&tmp_object->memq)) { - page = (vm_page_t) queue_first(&tmp_object->memq); - queue_remove(&tmp_object->memq, page, - vm_page_t, listq); - vm_page_insert(page, object2, page->offset); + assert(vm_page_queue_empty(&object2->memq)); + /* transfer tmp_object's pages to object2 */ + while (!vm_page_queue_empty(&tmp_object->memq)) { + page = (vm_page_t) vm_page_queue_first(&tmp_object->memq); + vm_page_queue_remove(&tmp_object->memq, page, vmp_listq); + vm_page_insert(page, object2, page->vmp_offset); } - assert(queue_empty(&tmp_object->memq)); + assert(vm_page_queue_empty(&tmp_object->memq)); } - /* no need to transpose the size: they should be identical */ - assert(object1->size == object2->size); - -#define __TRANSPOSE_FIELD(field) \ -MACRO_BEGIN \ - tmp_object->field = object1->field; \ - object1->field = object2->field; \ - object2->field = tmp_object->field; \ +#define __TRANSPOSE_FIELD(field) \ +MACRO_BEGIN \ + tmp_object->field = object1->field; \ + object1->field = object2->field; \ + object2->field = tmp_object->field; \ MACRO_END + /* "Lock" refers to the object not its contents */ + /* "size" should be identical */ + assert(object1->vo_size == object2->vo_size); + /* "memq_hint" was updated above when transposing pages */ + /* "ref_count" refers to the object not its contents */ + assert(object1->ref_count >= 1); + assert(object2->ref_count >= 1); +#if TASK_SWAPPER + /* "res_count" refers to the object not its contents */ +#endif + /* "resident_page_count" was updated above when transposing pages */ + /* "wired_page_count" was updated above when transposing pages */ +#if !VM_TAG_ACTIVE_UPDATE + /* "wired_objq" was dealt with along with "wired_page_count" */ +#endif /* ! 
VM_TAG_ACTIVE_UPDATE */ + /* "reusable_page_count" was updated above when transposing pages */ + /* there should be no "copy" */ assert(!object1->copy); assert(!object2->copy); - + /* there should be no "shadow" */ assert(!object1->shadow); assert(!object2->shadow); - - __TRANSPOSE_FIELD(shadow_offset); /* used by phys_contiguous objects */ + __TRANSPOSE_FIELD(vo_shadow_offset); /* used by phys_contiguous objects */ __TRANSPOSE_FIELD(pager); __TRANSPOSE_FIELD(paging_offset); - __TRANSPOSE_FIELD(pager_control); /* update the memory_objects' pointers back to the VM objects */ if (object1->pager_control != MEMORY_OBJECT_CONTROL_NULL) { memory_object_control_collapse(object1->pager_control, - object1); + object1); } if (object2->pager_control != MEMORY_OBJECT_CONTROL_NULL) { memory_object_control_collapse(object2->pager_control, - object2); + object2); } - - __TRANSPOSE_FIELD(absent_count); - - assert(object1->paging_in_progress); - assert(object2->paging_in_progress); - + __TRANSPOSE_FIELD(copy_strategy); + /* "paging_in_progress" refers to the object not its contents */ + assert(!object1->paging_in_progress); + assert(!object2->paging_in_progress); + assert(object1->activity_in_progress); + assert(object2->activity_in_progress); + /* "all_wanted" refers to the object not its contents */ __TRANSPOSE_FIELD(pager_created); __TRANSPOSE_FIELD(pager_initialized); __TRANSPOSE_FIELD(pager_ready); __TRANSPOSE_FIELD(pager_trusted); + __TRANSPOSE_FIELD(can_persist); __TRANSPOSE_FIELD(internal); - __TRANSPOSE_FIELD(temporary); __TRANSPOSE_FIELD(private); __TRANSPOSE_FIELD(pageout); + /* "alive" should be set */ + assert(object1->alive); + assert(object2->alive); + /* "purgeable" should be non-purgeable */ + assert(object1->purgable == VM_PURGABLE_DENY); + assert(object2->purgable == VM_PURGABLE_DENY); + /* "shadowed" refers to the the object not its contents */ + __TRANSPOSE_FIELD(purgeable_when_ripe); __TRANSPOSE_FIELD(true_share); + /* "terminating" should not be set */ + assert(!object1->terminating); + assert(!object2->terminating); + /* transfer "named" reference if needed */ + if (object1->named && !object2->named) { + assert(object1->ref_count >= 2); + assert(object2->ref_count >= 1); + object1->ref_count--; + object2->ref_count++; + } else if (!object1->named && object2->named) { + assert(object1->ref_count >= 1); + assert(object2->ref_count >= 2); + object1->ref_count++; + object2->ref_count--; + } + __TRANSPOSE_FIELD(named); + /* "shadow_severed" refers to the object not its contents */ __TRANSPOSE_FIELD(phys_contiguous); __TRANSPOSE_FIELD(nophyscache); + /* "cached_list.next" points to transposed object */ + object1->cached_list.next = (queue_entry_t) object2; + object2->cached_list.next = (queue_entry_t) object1; + /* "cached_list.prev" should be NULL */ + assert(object1->cached_list.prev == NULL); + assert(object2->cached_list.prev == NULL); __TRANSPOSE_FIELD(last_alloc); __TRANSPOSE_FIELD(sequential); - __TRANSPOSE_FIELD(cluster_size); - __TRANSPOSE_FIELD(existence_map); + __TRANSPOSE_FIELD(pages_created); + __TRANSPOSE_FIELD(pages_used); + __TRANSPOSE_FIELD(scan_collisions); __TRANSPOSE_FIELD(cow_hint); __TRANSPOSE_FIELD(wimg_bits); + __TRANSPOSE_FIELD(set_cache_attr); + __TRANSPOSE_FIELD(code_signed); + object1->transposed = TRUE; + object2->transposed = TRUE; + __TRANSPOSE_FIELD(mapping_in_progress); + __TRANSPOSE_FIELD(volatile_empty); + __TRANSPOSE_FIELD(volatile_fault); + __TRANSPOSE_FIELD(all_reusable); + assert(object1->blocked_access); + assert(object2->blocked_access); + 
__TRANSPOSE_FIELD(set_cache_attr); + assert(!object1->object_is_shared_cache); + assert(!object2->object_is_shared_cache); + /* ignore purgeable_queue_type and purgeable_queue_group */ + assert(!object1->io_tracking); + assert(!object2->io_tracking); +#if VM_OBJECT_ACCESS_TRACKING + assert(!object1->access_tracking); + assert(!object2->access_tracking); +#endif /* VM_OBJECT_ACCESS_TRACKING */ + __TRANSPOSE_FIELD(no_tag_update); +#if CONFIG_SECLUDED_MEMORY + assert(!object1->eligible_for_secluded); + assert(!object2->eligible_for_secluded); + assert(!object1->can_grab_secluded); + assert(!object2->can_grab_secluded); +#else /* CONFIG_SECLUDED_MEMORY */ + assert(object1->__object3_unused_bits == 0); + assert(object2->__object3_unused_bits == 0); +#endif /* CONFIG_SECLUDED_MEMORY */ +#if UPL_DEBUG + /* "uplq" refers to the object not its contents (see upl_transpose()) */ +#endif + assert((object1->purgable == VM_PURGABLE_DENY) || (object1->objq.next == NULL)); + assert((object1->purgable == VM_PURGABLE_DENY) || (object1->objq.prev == NULL)); + assert((object2->purgable == VM_PURGABLE_DENY) || (object2->objq.next == NULL)); + assert((object2->purgable == VM_PURGABLE_DENY) || (object2->objq.prev == NULL)); #undef __TRANSPOSE_FIELD @@ -5570,7 +6563,6 @@ done: * Cleanup. */ if (tmp_object != VM_OBJECT_NULL) { - vm_object_paging_end(tmp_object); vm_object_unlock(tmp_object); /* * Re-initialize the temporary object to avoid @@ -5589,18 +6581,2033 @@ done: vm_object_unlock(object2); object2_locked = FALSE; } - if (object1_paging) { - vm_object_lock(object1); - vm_object_paging_end(object1); - vm_object_unlock(object1); - object1_paging = FALSE; + + vm_object_transpose_count++; + + return retval; +} + + +/* + * vm_object_cluster_size + * + * Determine how big a cluster we should issue an I/O for... + * + * Inputs: *start == offset of page needed + * *length == maximum cluster pager can handle + * Outputs: *start == beginning offset of cluster + * *length == length of cluster to try + * + * The original *start will be encompassed by the cluster + * + */ +extern int speculative_reads_disabled; + +/* + * Try to always keep these values an even multiple of PAGE_SIZE. We use these values + * to derive min_ph_bytes and max_ph_bytes (IMP: bytes not # of pages) and expect those values to + * always be page-aligned. The derivation could involve operations (e.g. division) + * that could give us non-page-size aligned values if we start out with values that + * are odd multiples of PAGE_SIZE. 
+ */ +#if CONFIG_EMBEDDED +unsigned int preheat_max_bytes = (1024 * 512); +#else /* CONFIG_EMBEDDED */ +unsigned int preheat_max_bytes = MAX_UPL_TRANSFER_BYTES; +#endif /* CONFIG_EMBEDDED */ +unsigned int preheat_min_bytes = (1024 * 32); + + +__private_extern__ void +vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start, + vm_size_t *length, vm_object_fault_info_t fault_info, uint32_t *io_streaming) +{ + vm_size_t pre_heat_size; + vm_size_t tail_size; + vm_size_t head_size; + vm_size_t max_length; + vm_size_t cluster_size; + vm_object_offset_t object_size; + vm_object_offset_t orig_start; + vm_object_offset_t target_start; + vm_object_offset_t offset; + vm_behavior_t behavior; + boolean_t look_behind = TRUE; + boolean_t look_ahead = TRUE; + boolean_t isSSD = FALSE; + uint32_t throttle_limit; + int sequential_run; + int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL; + vm_size_t max_ph_size; + vm_size_t min_ph_size; + + assert( !(*length & PAGE_MASK)); + assert( !(*start & PAGE_MASK_64)); + + /* + * remember maxiumum length of run requested + */ + max_length = *length; + /* + * we'll always return a cluster size of at least + * 1 page, since the original fault must always + * be processed + */ + *length = PAGE_SIZE; + *io_streaming = 0; + + if (speculative_reads_disabled || fault_info == NULL) { + /* + * no cluster... just fault the page in + */ + return; } - if (object2_paging) { - vm_object_lock(object2); - vm_object_paging_end(object2); - vm_object_unlock(object2); - object2_paging = FALSE; + orig_start = *start; + target_start = orig_start; + cluster_size = round_page(fault_info->cluster_size); + behavior = fault_info->behavior; + + vm_object_lock(object); + + if (object->pager == MEMORY_OBJECT_NULL) { + goto out; /* pager is gone for this object, nothing more to do */ } + vnode_pager_get_isSSD(object->pager, &isSSD); - return retval; + min_ph_size = round_page(preheat_min_bytes); + max_ph_size = round_page(preheat_max_bytes); + +#if !CONFIG_EMBEDDED + if (isSSD) { + min_ph_size /= 2; + max_ph_size /= 8; + + if (min_ph_size & PAGE_MASK_64) { + min_ph_size = trunc_page(min_ph_size); + } + + if (max_ph_size & PAGE_MASK_64) { + max_ph_size = trunc_page(max_ph_size); + } + } +#endif /* !CONFIG_EMBEDDED */ + + if (min_ph_size < PAGE_SIZE) { + min_ph_size = PAGE_SIZE; + } + + if (max_ph_size < PAGE_SIZE) { + max_ph_size = PAGE_SIZE; + } else if (max_ph_size > MAX_UPL_TRANSFER_BYTES) { + max_ph_size = MAX_UPL_TRANSFER_BYTES; + } + + if (max_length > max_ph_size) { + max_length = max_ph_size; + } + + if (max_length <= PAGE_SIZE) { + goto out; + } + + if (object->internal) { + object_size = object->vo_size; + } else { + vnode_pager_get_object_size(object->pager, &object_size); + } + + object_size = round_page_64(object_size); + + if (orig_start >= object_size) { + /* + * fault occurred beyond the EOF... 
+ * we need to punt w/o changing the + * starting offset + */ + goto out; + } + if (object->pages_used > object->pages_created) { + /* + * must have wrapped our 32 bit counters + * so reset + */ + object->pages_used = object->pages_created = 0; + } + if ((sequential_run = object->sequential)) { + if (sequential_run < 0) { + sequential_behavior = VM_BEHAVIOR_RSEQNTL; + sequential_run = 0 - sequential_run; + } else { + sequential_behavior = VM_BEHAVIOR_SEQUENTIAL; + } + } + switch (behavior) { + default: + behavior = VM_BEHAVIOR_DEFAULT; + + case VM_BEHAVIOR_DEFAULT: + if (object->internal && fault_info->user_tag == VM_MEMORY_STACK) { + goto out; + } + + if (sequential_run >= (3 * PAGE_SIZE)) { + pre_heat_size = sequential_run + PAGE_SIZE; + + if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) { + look_behind = FALSE; + } else { + look_ahead = FALSE; + } + + *io_streaming = 1; + } else { + if (object->pages_created < (20 * (min_ph_size >> PAGE_SHIFT))) { + /* + * prime the pump + */ + pre_heat_size = min_ph_size; + } else { + /* + * Linear growth in PH size: The maximum size is max_length... + * this cacluation will result in a size that is neither a + * power of 2 nor a multiple of PAGE_SIZE... so round + * it up to the nearest PAGE_SIZE boundary + */ + pre_heat_size = (max_length * (uint64_t)object->pages_used) / object->pages_created; + + if (pre_heat_size < min_ph_size) { + pre_heat_size = min_ph_size; + } else { + pre_heat_size = round_page(pre_heat_size); + } + } + } + break; + + case VM_BEHAVIOR_RANDOM: + if ((pre_heat_size = cluster_size) <= PAGE_SIZE) { + goto out; + } + break; + + case VM_BEHAVIOR_SEQUENTIAL: + if ((pre_heat_size = cluster_size) == 0) { + pre_heat_size = sequential_run + PAGE_SIZE; + } + look_behind = FALSE; + *io_streaming = 1; + + break; + + case VM_BEHAVIOR_RSEQNTL: + if ((pre_heat_size = cluster_size) == 0) { + pre_heat_size = sequential_run + PAGE_SIZE; + } + look_ahead = FALSE; + *io_streaming = 1; + + break; + } + throttle_limit = (uint32_t) max_length; + assert(throttle_limit == max_length); + + if (vnode_pager_get_throttle_io_limit(object->pager, &throttle_limit) == KERN_SUCCESS) { + if (max_length > throttle_limit) { + max_length = throttle_limit; + } + } + if (pre_heat_size > max_length) { + pre_heat_size = max_length; + } + + if (behavior == VM_BEHAVIOR_DEFAULT && (pre_heat_size > min_ph_size)) { + unsigned int consider_free = vm_page_free_count + vm_page_cleaned_count; + + if (consider_free < vm_page_throttle_limit) { + pre_heat_size = trunc_page(pre_heat_size / 16); + } else if (consider_free < vm_page_free_target) { + pre_heat_size = trunc_page(pre_heat_size / 4); + } + + if (pre_heat_size < min_ph_size) { + pre_heat_size = min_ph_size; + } + } + if (look_ahead == TRUE) { + if (look_behind == TRUE) { + /* + * if we get here its due to a random access... + * so we want to center the original fault address + * within the cluster we will issue... make sure + * to calculate 'head_size' as a multiple of PAGE_SIZE... + * 'pre_heat_size' is a multiple of PAGE_SIZE but not + * necessarily an even number of pages so we need to truncate + * the result to a PAGE_SIZE boundary + */ + head_size = trunc_page(pre_heat_size / 2); + + if (target_start > head_size) { + target_start -= head_size; + } else { + target_start = 0; + } + + /* + * 'target_start' at this point represents the beginning offset + * of the cluster we are considering... 
'orig_start' will be in + * the center of this cluster if we didn't have to clip the start + * due to running into the start of the file + */ + } + if ((target_start + pre_heat_size) > object_size) { + pre_heat_size = (vm_size_t)(round_page_64(object_size - target_start)); + } + /* + * at this point caclulate the number of pages beyond the original fault + * address that we want to consider... this is guaranteed not to extend beyond + * the current EOF... + */ + assert((vm_size_t)(orig_start - target_start) == (orig_start - target_start)); + tail_size = pre_heat_size - (vm_size_t)(orig_start - target_start) - PAGE_SIZE; + } else { + if (pre_heat_size > target_start) { + /* + * since pre_heat_size is always smaller then 2^32, + * if it is larger then target_start (a 64 bit value) + * it is safe to clip target_start to 32 bits + */ + pre_heat_size = (vm_size_t) target_start; + } + tail_size = 0; + } + assert( !(target_start & PAGE_MASK_64)); + assert( !(pre_heat_size & PAGE_MASK_64)); + + if (pre_heat_size <= PAGE_SIZE) { + goto out; + } + + if (look_behind == TRUE) { + /* + * take a look at the pages before the original + * faulting offset... recalculate this in case + * we had to clip 'pre_heat_size' above to keep + * from running past the EOF. + */ + head_size = pre_heat_size - tail_size - PAGE_SIZE; + + for (offset = orig_start - PAGE_SIZE_64; head_size; offset -= PAGE_SIZE_64, head_size -= PAGE_SIZE) { + /* + * don't poke below the lowest offset + */ + if (offset < fault_info->lo_offset) { + break; + } + /* + * for external objects or internal objects w/o a pager, + * VM_COMPRESSOR_PAGER_STATE_GET will return VM_EXTERNAL_STATE_UNKNOWN + */ + if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset) == VM_EXTERNAL_STATE_ABSENT) { + break; + } + if (vm_page_lookup(object, offset) != VM_PAGE_NULL) { + /* + * don't bridge resident pages + */ + break; + } + *start = offset; + *length += PAGE_SIZE; + } + } + if (look_ahead == TRUE) { + for (offset = orig_start + PAGE_SIZE_64; tail_size; offset += PAGE_SIZE_64, tail_size -= PAGE_SIZE) { + /* + * don't poke above the highest offset + */ + if (offset >= fault_info->hi_offset) { + break; + } + assert(offset < object_size); + + /* + * for external objects or internal objects w/o a pager, + * VM_COMPRESSOR_PAGER_STATE_GET will return VM_EXTERNAL_STATE_UNKNOWN + */ + if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset) == VM_EXTERNAL_STATE_ABSENT) { + break; + } + if (vm_page_lookup(object, offset) != VM_PAGE_NULL) { + /* + * don't bridge resident pages + */ + break; + } + *length += PAGE_SIZE; + } + } +out: + if (*length > max_length) { + *length = max_length; + } + + vm_object_unlock(object); + + DTRACE_VM1(clustersize, vm_size_t, *length); +} + + +/* + * Allow manipulation of individual page state. 
This is actually part of + * the UPL regimen but takes place on the VM object rather than on a UPL + */ + +kern_return_t +vm_object_page_op( + vm_object_t object, + vm_object_offset_t offset, + int ops, + ppnum_t *phys_entry, + int *flags) +{ + vm_page_t dst_page; + + vm_object_lock(object); + + if (ops & UPL_POP_PHYSICAL) { + if (object->phys_contiguous) { + if (phys_entry) { + *phys_entry = (ppnum_t) + (object->vo_shadow_offset >> PAGE_SHIFT); + } + vm_object_unlock(object); + return KERN_SUCCESS; + } else { + vm_object_unlock(object); + return KERN_INVALID_OBJECT; + } + } + if (object->phys_contiguous) { + vm_object_unlock(object); + return KERN_INVALID_OBJECT; + } + + while (TRUE) { + if ((dst_page = vm_page_lookup(object, offset)) == VM_PAGE_NULL) { + vm_object_unlock(object); + return KERN_FAILURE; + } + + /* Sync up on getting the busy bit */ + if ((dst_page->vmp_busy || dst_page->vmp_cleaning) && + (((ops & UPL_POP_SET) && + (ops & UPL_POP_BUSY)) || (ops & UPL_POP_DUMP))) { + /* someone else is playing with the page, we will */ + /* have to wait */ + PAGE_SLEEP(object, dst_page, THREAD_UNINT); + continue; + } + + if (ops & UPL_POP_DUMP) { + if (dst_page->vmp_pmapped == TRUE) { + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page)); + } + + VM_PAGE_FREE(dst_page); + break; + } + + if (flags) { + *flags = 0; + + /* Get the condition of flags before requested ops */ + /* are undertaken */ + + if (dst_page->vmp_dirty) { + *flags |= UPL_POP_DIRTY; + } + if (dst_page->vmp_free_when_done) { + *flags |= UPL_POP_PAGEOUT; + } + if (dst_page->vmp_precious) { + *flags |= UPL_POP_PRECIOUS; + } + if (dst_page->vmp_absent) { + *flags |= UPL_POP_ABSENT; + } + if (dst_page->vmp_busy) { + *flags |= UPL_POP_BUSY; + } + } + + /* The caller should have made a call either contingent with */ + /* or prior to this call to set UPL_POP_BUSY */ + if (ops & UPL_POP_SET) { + /* The protection granted with this assert will */ + /* not be complete. If the caller violates the */ + /* convention and attempts to change page state */ + /* without first setting busy we may not see it */ + /* because the page may already be busy. However */ + /* if such violations occur we will assert sooner */ + /* or later. */ + assert(dst_page->vmp_busy || (ops & UPL_POP_BUSY)); + if (ops & UPL_POP_DIRTY) { + SET_PAGE_DIRTY(dst_page, FALSE); + } + if (ops & UPL_POP_PAGEOUT) { + dst_page->vmp_free_when_done = TRUE; + } + if (ops & UPL_POP_PRECIOUS) { + dst_page->vmp_precious = TRUE; + } + if (ops & UPL_POP_ABSENT) { + dst_page->vmp_absent = TRUE; + } + if (ops & UPL_POP_BUSY) { + dst_page->vmp_busy = TRUE; + } + } + + if (ops & UPL_POP_CLR) { + assert(dst_page->vmp_busy); + if (ops & UPL_POP_DIRTY) { + dst_page->vmp_dirty = FALSE; + } + if (ops & UPL_POP_PAGEOUT) { + dst_page->vmp_free_when_done = FALSE; + } + if (ops & UPL_POP_PRECIOUS) { + dst_page->vmp_precious = FALSE; + } + if (ops & UPL_POP_ABSENT) { + dst_page->vmp_absent = FALSE; + } + if (ops & UPL_POP_BUSY) { + dst_page->vmp_busy = FALSE; + PAGE_WAKEUP(dst_page); + } + } + if (phys_entry) { + /* + * The physical page number will remain valid + * only if the page is kept busy. + */ + assert(dst_page->vmp_busy); + *phys_entry = VM_PAGE_GET_PHYS_PAGE(dst_page); + } + + break; + } + + vm_object_unlock(object); + return KERN_SUCCESS; +} + +/* + * vm_object_range_op offers performance enhancement over + * vm_object_page_op for page_op functions which do not require page + * level state to be returned from the call. 
Page_op was created to provide + * a low-cost alternative to page manipulation via UPLs when only a single + * page was involved. The range_op call establishes the ability in the _op + * family of functions to work on multiple pages where the lack of page level + * state handling allows the caller to avoid the overhead of the upl structures. + */ + +kern_return_t +vm_object_range_op( + vm_object_t object, + vm_object_offset_t offset_beg, + vm_object_offset_t offset_end, + int ops, + uint32_t *range) +{ + vm_object_offset_t offset; + vm_page_t dst_page; + + if (offset_end - offset_beg > (uint32_t) -1) { + /* range is too big and would overflow "*range" */ + return KERN_INVALID_ARGUMENT; + } + if (object->resident_page_count == 0) { + if (range) { + if (ops & UPL_ROP_PRESENT) { + *range = 0; + } else { + *range = (uint32_t) (offset_end - offset_beg); + assert(*range == (offset_end - offset_beg)); + } + } + return KERN_SUCCESS; + } + vm_object_lock(object); + + if (object->phys_contiguous) { + vm_object_unlock(object); + return KERN_INVALID_OBJECT; + } + + offset = offset_beg & ~PAGE_MASK_64; + + while (offset < offset_end) { + dst_page = vm_page_lookup(object, offset); + if (dst_page != VM_PAGE_NULL) { + if (ops & UPL_ROP_DUMP) { + if (dst_page->vmp_busy || dst_page->vmp_cleaning) { + /* + * someone else is playing with the + * page, we will have to wait + */ + PAGE_SLEEP(object, dst_page, THREAD_UNINT); + /* + * need to relook the page up since it's + * state may have changed while we slept + * it might even belong to a different object + * at this point + */ + continue; + } + if (dst_page->vmp_laundry) { + vm_pageout_steal_laundry(dst_page, FALSE); + } + + if (dst_page->vmp_pmapped == TRUE) { + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page)); + } + + VM_PAGE_FREE(dst_page); + } else if ((ops & UPL_ROP_ABSENT) + && (!dst_page->vmp_absent || dst_page->vmp_busy)) { + break; + } + } else if (ops & UPL_ROP_PRESENT) { + break; + } + + offset += PAGE_SIZE; + } + vm_object_unlock(object); + + if (range) { + if (offset > offset_end) { + offset = offset_end; + } + if (offset > offset_beg) { + *range = (uint32_t) (offset - offset_beg); + assert(*range == (offset - offset_beg)); + } else { + *range = 0; + } + } + return KERN_SUCCESS; +} + +/* + * Used to point a pager directly to a range of memory (when the pager may be associated + * with a non-device vnode). Takes a virtual address, an offset, and a size. We currently + * expect that the virtual address will denote the start of a range that is physically contiguous. 
+ */ +kern_return_t +pager_map_to_phys_contiguous( + memory_object_control_t object, + memory_object_offset_t offset, + addr64_t base_vaddr, + vm_size_t size) +{ + ppnum_t page_num; + boolean_t clobbered_private; + kern_return_t retval; + vm_object_t pager_object; + + page_num = pmap_find_phys(kernel_pmap, base_vaddr); + + if (!page_num) { + retval = KERN_FAILURE; + goto out; + } + + pager_object = memory_object_control_to_vm_object(object); + + if (!pager_object) { + retval = KERN_FAILURE; + goto out; + } + + clobbered_private = pager_object->private; + if (pager_object->private != TRUE) { + vm_object_lock(pager_object); + pager_object->private = TRUE; + vm_object_unlock(pager_object); + } + retval = vm_object_populate_with_private(pager_object, offset, page_num, size); + + if (retval != KERN_SUCCESS) { + if (pager_object->private != clobbered_private) { + vm_object_lock(pager_object); + pager_object->private = clobbered_private; + vm_object_unlock(pager_object); + } + } + +out: + return retval; +} + +uint32_t scan_object_collision = 0; + +void +vm_object_lock(vm_object_t object) +{ + if (object == vm_pageout_scan_wants_object) { + scan_object_collision++; + mutex_pause(2); + } + DTRACE_VM(vm_object_lock_w); + lck_rw_lock_exclusive(&object->Lock); +#if DEVELOPMENT || DEBUG + object->Lock_owner = current_thread(); +#endif +} + +boolean_t +vm_object_lock_avoid(vm_object_t object) +{ + if (object == vm_pageout_scan_wants_object) { + scan_object_collision++; + return TRUE; + } + return FALSE; +} + +boolean_t +_vm_object_lock_try(vm_object_t object) +{ + boolean_t retval; + + retval = lck_rw_try_lock_exclusive(&object->Lock); +#if DEVELOPMENT || DEBUG + if (retval == TRUE) { + DTRACE_VM(vm_object_lock_w); + object->Lock_owner = current_thread(); + } +#endif + return retval; +} + +boolean_t +vm_object_lock_try(vm_object_t object) +{ + /* + * Called from hibernate path so check before blocking. 
+ */ + if (vm_object_lock_avoid(object) && ml_get_interrupts_enabled() && get_preemption_level() == 0) { + mutex_pause(2); + } + return _vm_object_lock_try(object); +} + +void +vm_object_lock_shared(vm_object_t object) +{ + if (vm_object_lock_avoid(object)) { + mutex_pause(2); + } + DTRACE_VM(vm_object_lock_r); + lck_rw_lock_shared(&object->Lock); +} + +boolean_t +vm_object_lock_yield_shared(vm_object_t object) +{ + boolean_t retval = FALSE, force_yield = FALSE;; + + vm_object_lock_assert_shared(object); + + force_yield = vm_object_lock_avoid(object); + + retval = lck_rw_lock_yield_shared(&object->Lock, force_yield); + if (retval) { + DTRACE_VM(vm_object_lock_yield); + } + + return retval; +} + +boolean_t +vm_object_lock_try_shared(vm_object_t object) +{ + boolean_t retval; + + if (vm_object_lock_avoid(object)) { + mutex_pause(2); + } + retval = lck_rw_try_lock_shared(&object->Lock); + if (retval) { + DTRACE_VM(vm_object_lock_r); + } + return retval; +} + +boolean_t +vm_object_lock_upgrade(vm_object_t object) +{ + boolean_t retval; + + retval = lck_rw_lock_shared_to_exclusive(&object->Lock); +#if DEVELOPMENT || DEBUG + if (retval == TRUE) { + DTRACE_VM(vm_object_lock_w); + object->Lock_owner = current_thread(); + } +#endif + return retval; +} + +void +vm_object_unlock(vm_object_t object) +{ +#if DEVELOPMENT || DEBUG + if (object->Lock_owner) { + if (object->Lock_owner != current_thread()) { + panic("vm_object_unlock: not owner - %p\n", object); + } + object->Lock_owner = 0; + DTRACE_VM(vm_object_unlock); + } +#endif + lck_rw_done(&object->Lock); +} + + +unsigned int vm_object_change_wimg_mode_count = 0; + +/* + * The object must be locked + */ +void +vm_object_change_wimg_mode(vm_object_t object, unsigned int wimg_mode) +{ + vm_page_t p; + + vm_object_lock_assert_exclusive(object); + + vm_object_paging_wait(object, THREAD_UNINT); + + vm_page_queue_iterate(&object->memq, p, vmp_listq) { + if (!p->vmp_fictitious) { + pmap_set_cache_attributes(VM_PAGE_GET_PHYS_PAGE(p), wimg_mode); + } + } + if (wimg_mode == VM_WIMG_USE_DEFAULT) { + object->set_cache_attr = FALSE; + } else { + object->set_cache_attr = TRUE; + } + + object->wimg_bits = wimg_mode; + + vm_object_change_wimg_mode_count++; +} + +#if CONFIG_FREEZE + +/* + * This routine does the "relocation" of previously + * compressed pages belonging to this object that are + * residing in a number of compressed segments into + * a set of compressed segments dedicated to hold + * compressed pages belonging to this object. 
+ */ + +extern void *freezer_chead; +extern char *freezer_compressor_scratch_buf; +extern int c_freezer_compression_count; +extern AbsoluteTime c_freezer_last_yield_ts; + +#define MAX_FREE_BATCH 32 +#define FREEZER_DUTY_CYCLE_ON_MS 5 +#define FREEZER_DUTY_CYCLE_OFF_MS 5 + +static int c_freezer_should_yield(void); + + +static int +c_freezer_should_yield() +{ + AbsoluteTime cur_time; + uint64_t nsecs; + + assert(c_freezer_last_yield_ts); + clock_get_uptime(&cur_time); + + SUB_ABSOLUTETIME(&cur_time, &c_freezer_last_yield_ts); + absolutetime_to_nanoseconds(cur_time, &nsecs); + + if (nsecs > 1000 * 1000 * FREEZER_DUTY_CYCLE_ON_MS) { + return 1; + } + return 0; +} + + +void +vm_object_compressed_freezer_done() +{ + vm_compressor_finished_filling(&freezer_chead); +} + + +uint32_t +vm_object_compressed_freezer_pageout( + vm_object_t object, uint32_t dirty_budget) +{ + vm_page_t p; + vm_page_t local_freeq = NULL; + int local_freed = 0; + kern_return_t retval = KERN_SUCCESS; + int obj_resident_page_count_snapshot = 0; + uint32_t paged_out_count = 0; + + assert(object != VM_OBJECT_NULL); + assert(object->internal); + + vm_object_lock(object); + + if (!object->pager_initialized || object->pager == MEMORY_OBJECT_NULL) { + if (!object->pager_initialized) { + vm_object_collapse(object, (vm_object_offset_t) 0, TRUE); + + if (!object->pager_initialized) { + vm_object_compressor_pager_create(object); + } + } + + if (!object->pager_initialized || object->pager == MEMORY_OBJECT_NULL) { + vm_object_unlock(object); + return paged_out_count; + } + } + + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + vm_object_offset_t curr_offset = 0; + + /* + * Go through the object and make sure that any + * previously compressed pages are relocated into + * a compressed segment associated with our "freezer_chead". + */ + while (curr_offset < object->vo_size) { + curr_offset = vm_compressor_pager_next_compressed(object->pager, curr_offset); + + if (curr_offset == (vm_object_offset_t) -1) { + break; + } + + retval = vm_compressor_pager_relocate(object->pager, curr_offset, &freezer_chead); + + if (retval != KERN_SUCCESS) { + break; + } + + curr_offset += PAGE_SIZE_64; + } + } + + /* + * We can't hold the object lock while heading down into the compressed pager + * layer because we might need the kernel map lock down there to allocate new + * compressor data structures. And if this same object is mapped in the kernel + * and there's a fault on it, then that thread will want the object lock while + * holding the kernel map lock. + * + * Since we are going to drop/grab the object lock repeatedly, we must make sure + * we won't be stuck in an infinite loop if the same page(s) keep getting + * decompressed. So we grab a snapshot of the number of pages in the object and + * we won't process any more than that number of pages. 
+ */ + + obj_resident_page_count_snapshot = object->resident_page_count; + + vm_object_activity_begin(object); + + while ((obj_resident_page_count_snapshot--) && !vm_page_queue_empty(&object->memq) && paged_out_count < dirty_budget) { + p = (vm_page_t)vm_page_queue_first(&object->memq); + + KERNEL_DEBUG(0xe0430004 | DBG_FUNC_START, object, local_freed, 0, 0, 0); + + vm_page_lockspin_queues(); + + if (p->vmp_cleaning || p->vmp_fictitious || p->vmp_busy || p->vmp_absent || p->vmp_unusual || p->vmp_error || VM_PAGE_WIRED(p)) { + vm_page_unlock_queues(); + + KERNEL_DEBUG(0xe0430004 | DBG_FUNC_END, object, local_freed, 1, 0, 0); + + vm_page_queue_remove(&object->memq, p, vmp_listq); + vm_page_queue_enter(&object->memq, p, vmp_listq); + + continue; + } + + if (p->vmp_pmapped == TRUE) { + int refmod_state, pmap_flags; + + if (p->vmp_dirty || p->vmp_precious) { + pmap_flags = PMAP_OPTIONS_COMPRESSOR; + } else { + pmap_flags = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED; + } + + refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(p), pmap_flags, NULL); + if (refmod_state & VM_MEM_MODIFIED) { + SET_PAGE_DIRTY(p, FALSE); + } + } + + if (p->vmp_dirty == FALSE && p->vmp_precious == FALSE) { + /* + * Clean and non-precious page. + */ + vm_page_unlock_queues(); + VM_PAGE_FREE(p); + + KERNEL_DEBUG(0xe0430004 | DBG_FUNC_END, object, local_freed, 2, 0, 0); + continue; + } + + if (p->vmp_laundry) { + vm_pageout_steal_laundry(p, TRUE); + } + + vm_page_queues_remove(p, TRUE); + + vm_page_unlock_queues(); + + + /* + * In case the compressor fails to compress this page, we need it at + * the back of the object memq so that we don't keep trying to process it. + * Make the move here while we have the object lock held. + */ + + vm_page_queue_remove(&object->memq, p, vmp_listq); + vm_page_queue_enter(&object->memq, p, vmp_listq); + + /* + * Grab an activity_in_progress here for vm_pageout_compress_page() to consume. + * + * Mark the page busy so no one messes with it while we have the object lock dropped. 
+ */ + p->vmp_busy = TRUE; + + vm_object_activity_begin(object); + + vm_object_unlock(object); + + if (vm_pageout_compress_page(&freezer_chead, freezer_compressor_scratch_buf, p) == KERN_SUCCESS) { + /* + * page has already been un-tabled from the object via 'vm_page_remove' + */ + p->vmp_snext = local_freeq; + local_freeq = p; + local_freed++; + paged_out_count++; + + if (local_freed >= MAX_FREE_BATCH) { + OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions); + + vm_page_free_list(local_freeq, TRUE); + + local_freeq = NULL; + local_freed = 0; + } + c_freezer_compression_count++; + } + KERNEL_DEBUG(0xe0430004 | DBG_FUNC_END, object, local_freed, 0, 0, 0); + + if (local_freed == 0 && c_freezer_should_yield()) { + thread_yield_internal(FREEZER_DUTY_CYCLE_OFF_MS); + clock_get_uptime(&c_freezer_last_yield_ts); + } + + vm_object_lock(object); + } + + if (local_freeq) { + OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions); + + vm_page_free_list(local_freeq, TRUE); + + local_freeq = NULL; + local_freed = 0; + } + + vm_object_activity_end(object); + + vm_object_unlock(object); + + if (c_freezer_should_yield()) { + thread_yield_internal(FREEZER_DUTY_CYCLE_OFF_MS); + clock_get_uptime(&c_freezer_last_yield_ts); + } + return paged_out_count; +} + +#endif /* CONFIG_FREEZE */ + + +void +vm_object_pageout( + vm_object_t object) +{ + vm_page_t p, next; + struct vm_pageout_queue *iq; + + if (!VM_CONFIG_COMPRESSOR_IS_PRESENT) { + return; + } + + iq = &vm_pageout_queue_internal; + + assert(object != VM_OBJECT_NULL ); + + vm_object_lock(object); + + if (!object->internal || + object->terminating || + !object->alive) { + vm_object_unlock(object); + return; + } + + if (!object->pager_initialized || object->pager == MEMORY_OBJECT_NULL) { + if (!object->pager_initialized) { + vm_object_collapse(object, (vm_object_offset_t) 0, TRUE); + + if (!object->pager_initialized) { + vm_object_compressor_pager_create(object); + } + } + + if (!object->pager_initialized || object->pager == MEMORY_OBJECT_NULL) { + vm_object_unlock(object); + return; + } + } + +ReScan: + next = (vm_page_t)vm_page_queue_first(&object->memq); + + while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next)) { + p = next; + next = (vm_page_t)vm_page_queue_next(&next->vmp_listq); + + assert(p->vmp_q_state != VM_PAGE_ON_FREE_Q); + + if ((p->vmp_q_state == VM_PAGE_ON_THROTTLED_Q) || + p->vmp_cleaning || + p->vmp_laundry || + p->vmp_busy || + p->vmp_absent || + p->vmp_error || + p->vmp_fictitious || + VM_PAGE_WIRED(p)) { + /* + * Page is already being cleaned or can't be cleaned. + */ + continue; + } + if (vm_compressor_low_on_space()) { + break; + } + + /* Throw to the pageout queue */ + + vm_page_lockspin_queues(); + + if (VM_PAGE_Q_THROTTLED(iq)) { + iq->pgo_draining = TRUE; + + assert_wait((event_t) (&iq->pgo_laundry + 1), + THREAD_INTERRUPTIBLE); + vm_page_unlock_queues(); + vm_object_unlock(object); + + thread_block(THREAD_CONTINUE_NULL); + + vm_object_lock(object); + goto ReScan; + } + + assert(!p->vmp_fictitious); + assert(!p->vmp_busy); + assert(!p->vmp_absent); + assert(!p->vmp_unusual); + assert(!p->vmp_error); + assert(!VM_PAGE_WIRED(p)); + assert(!p->vmp_cleaning); + + if (p->vmp_pmapped == TRUE) { + int refmod_state; + int pmap_options; + + /* + * Tell pmap the page should be accounted + * for as "compressed" if it's been modified. 
+ */ + pmap_options = + PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED; + if (p->vmp_dirty || p->vmp_precious) { + /* + * We already know it's been modified, + * so tell pmap to account for it + * as "compressed". + */ + pmap_options = PMAP_OPTIONS_COMPRESSOR; + } + refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(p), + pmap_options, + NULL); + if (refmod_state & VM_MEM_MODIFIED) { + SET_PAGE_DIRTY(p, FALSE); + } + } + + if (!p->vmp_dirty && !p->vmp_precious) { + vm_page_unlock_queues(); + VM_PAGE_FREE(p); + continue; + } + vm_page_queues_remove(p, TRUE); + + vm_pageout_cluster(p); + + vm_page_unlock_queues(); + } + vm_object_unlock(object); +} + + +#if CONFIG_IOSCHED +void +vm_page_request_reprioritize(vm_object_t o, uint64_t blkno, uint32_t len, int prio) +{ + io_reprioritize_req_t req; + struct vnode *devvp = NULL; + + if (vnode_pager_get_object_devvp(o->pager, (uintptr_t *)&devvp) != KERN_SUCCESS) { + return; + } + + /* + * Create the request for I/O reprioritization. + * We use the noblock variant of zalloc because we're holding the object + * lock here and we could cause a deadlock in low memory conditions. + */ + req = (io_reprioritize_req_t)zalloc_noblock(io_reprioritize_req_zone); + if (req == NULL) { + return; + } + req->blkno = blkno; + req->len = len; + req->priority = prio; + req->devvp = devvp; + + /* Insert request into the reprioritization list */ + IO_REPRIORITIZE_LIST_LOCK(); + queue_enter(&io_reprioritize_list, req, io_reprioritize_req_t, io_reprioritize_list); + IO_REPRIORITIZE_LIST_UNLOCK(); + + /* Wakeup reprioritize thread */ + IO_REPRIO_THREAD_WAKEUP(); + + return; +} + +void +vm_decmp_upl_reprioritize(upl_t upl, int prio) +{ + int offset; + vm_object_t object; + io_reprioritize_req_t req; + struct vnode *devvp = NULL; + uint64_t blkno; + uint32_t len; + upl_t io_upl; + uint64_t *io_upl_reprio_info; + int io_upl_size; + + if ((upl->flags & UPL_TRACKED_BY_OBJECT) == 0 || (upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) { + return; + } + + /* + * We dont want to perform any allocations with the upl lock held since that might + * result in a deadlock. If the system is low on memory, the pageout thread would + * try to pageout stuff and might wait on this lock. If we are waiting for the memory to + * be freed up by the pageout thread, it would be a deadlock. + */ + + + /* First step is just to get the size of the upl to find out how big the reprio info is */ + if (!upl_try_lock(upl)) { + return; + } + + if (upl->decmp_io_upl == NULL) { + /* The real I/O upl was destroyed by the time we came in here. Nothing to do. */ + upl_unlock(upl); + return; + } + + io_upl = upl->decmp_io_upl; + assert((io_upl->flags & UPL_DECMP_REAL_IO) != 0); + io_upl_size = io_upl->size; + upl_unlock(upl); + + /* Now perform the allocation */ + io_upl_reprio_info = (uint64_t *)kalloc(sizeof(uint64_t) * (io_upl_size / PAGE_SIZE)); + if (io_upl_reprio_info == NULL) { + return; + } + + /* Now again take the lock, recheck the state and grab out the required info */ + if (!upl_try_lock(upl)) { + goto out; + } + + if (upl->decmp_io_upl == NULL || upl->decmp_io_upl != io_upl) { + /* The real I/O upl was destroyed by the time we came in here. Nothing to do. 
*/ + upl_unlock(upl); + goto out; + } + memcpy(io_upl_reprio_info, io_upl->upl_reprio_info, sizeof(uint64_t) * (io_upl_size / PAGE_SIZE)); + + /* Get the VM object for this UPL */ + if (io_upl->flags & UPL_SHADOWED) { + object = io_upl->map_object->shadow; + } else { + object = io_upl->map_object; + } + + /* Get the dev vnode ptr for this object */ + if (!object || !object->pager || + vnode_pager_get_object_devvp(object->pager, (uintptr_t *)&devvp) != KERN_SUCCESS) { + upl_unlock(upl); + goto out; + } + + upl_unlock(upl); + + /* Now we have all the information needed to do the expedite */ + + offset = 0; + while (offset < io_upl_size) { + blkno = io_upl_reprio_info[(offset / PAGE_SIZE)] & UPL_REPRIO_INFO_MASK; + len = (io_upl_reprio_info[(offset / PAGE_SIZE)] >> UPL_REPRIO_INFO_SHIFT) & UPL_REPRIO_INFO_MASK; + + /* + * This implementation may cause some spurious expedites due to the + * fact that we dont cleanup the blkno & len from the upl_reprio_info + * even after the I/O is complete. + */ + + if (blkno != 0 && len != 0) { + /* Create the request for I/O reprioritization */ + req = (io_reprioritize_req_t)zalloc(io_reprioritize_req_zone); + assert(req != NULL); + req->blkno = blkno; + req->len = len; + req->priority = prio; + req->devvp = devvp; + + /* Insert request into the reprioritization list */ + IO_REPRIORITIZE_LIST_LOCK(); + queue_enter(&io_reprioritize_list, req, io_reprioritize_req_t, io_reprioritize_list); + IO_REPRIORITIZE_LIST_UNLOCK(); + + offset += len; + } else { + offset += PAGE_SIZE; + } + } + + /* Wakeup reprioritize thread */ + IO_REPRIO_THREAD_WAKEUP(); + +out: + kfree(io_upl_reprio_info, sizeof(uint64_t) * (io_upl_size / PAGE_SIZE)); + return; +} + +void +vm_page_handle_prio_inversion(vm_object_t o, vm_page_t m) +{ + upl_t upl; + upl_page_info_t *pl; + unsigned int i, num_pages; + int cur_tier; + + cur_tier = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO); + + /* + * Scan through all UPLs associated with the object to find the + * UPL containing the contended page. + */ + queue_iterate(&o->uplq, upl, upl_t, uplq) { + if (((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) || upl->upl_priority <= cur_tier) { + continue; + } + pl = UPL_GET_INTERNAL_PAGE_LIST(upl); + num_pages = (upl->size / PAGE_SIZE); + + /* + * For each page in the UPL page list, see if it matches the contended + * page and was issued as a low prio I/O. 
+ */ + for (i = 0; i < num_pages; i++) { + if (UPL_PAGE_PRESENT(pl, i) && VM_PAGE_GET_PHYS_PAGE(m) == pl[i].phys_addr) { + if ((upl->flags & UPL_DECMP_REQ) && upl->decmp_io_upl) { + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_EXPEDITE)) | DBG_FUNC_NONE, VM_KERNEL_UNSLIDE_OR_PERM(upl->upl_creator), VM_KERNEL_UNSLIDE_OR_PERM(m), + VM_KERNEL_UNSLIDE_OR_PERM(upl), upl->upl_priority, 0); + vm_decmp_upl_reprioritize(upl, cur_tier); + break; + } + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_EXPEDITE)) | DBG_FUNC_NONE, VM_KERNEL_UNSLIDE_OR_PERM(upl->upl_creator), VM_KERNEL_UNSLIDE_OR_PERM(m), + upl->upl_reprio_info[i], upl->upl_priority, 0); + if (UPL_REPRIO_INFO_BLKNO(upl, i) != 0 && UPL_REPRIO_INFO_LEN(upl, i) != 0) { + vm_page_request_reprioritize(o, UPL_REPRIO_INFO_BLKNO(upl, i), UPL_REPRIO_INFO_LEN(upl, i), cur_tier); + } + break; + } + } + /* Check if we found any hits */ + if (i != num_pages) { + break; + } + } + + return; +} + +wait_result_t +vm_page_sleep(vm_object_t o, vm_page_t m, int interruptible) +{ + wait_result_t ret; + + KERNEL_DEBUG((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_SLEEP)) | DBG_FUNC_START, o, m, 0, 0, 0); + + if (o->io_tracking && ((m->vmp_busy == TRUE) || (m->vmp_cleaning == TRUE) || VM_PAGE_WIRED(m))) { + /* + * Indicates page is busy due to an I/O. Issue a reprioritize request if necessary. + */ + vm_page_handle_prio_inversion(o, m); + } + m->vmp_wanted = TRUE; + ret = thread_sleep_vm_object(o, m, interruptible); + KERNEL_DEBUG((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_SLEEP)) | DBG_FUNC_END, o, m, 0, 0, 0); + return ret; +} + +static void +io_reprioritize_thread(void *param __unused, wait_result_t wr __unused) +{ + io_reprioritize_req_t req = NULL; + + while (1) { + IO_REPRIORITIZE_LIST_LOCK(); + if (queue_empty(&io_reprioritize_list)) { + IO_REPRIORITIZE_LIST_UNLOCK(); + break; + } + + queue_remove_first(&io_reprioritize_list, req, io_reprioritize_req_t, io_reprioritize_list); + IO_REPRIORITIZE_LIST_UNLOCK(); + + vnode_pager_issue_reprioritize_io(req->devvp, req->blkno, req->len, req->priority); + zfree(io_reprioritize_req_zone, req); + } + + IO_REPRIO_THREAD_CONTINUATION(); +} +#endif + +#if VM_OBJECT_ACCESS_TRACKING +void +vm_object_access_tracking( + vm_object_t object, + int *access_tracking_p, + uint32_t *access_tracking_reads_p, + uint32_t *access_tracking_writes_p) +{ + int access_tracking; + + access_tracking = !!*access_tracking_p; + + vm_object_lock(object); + *access_tracking_p = object->access_tracking; + if (access_tracking_reads_p) { + *access_tracking_reads_p = object->access_tracking_reads; + } + if (access_tracking_writes_p) { + *access_tracking_writes_p = object->access_tracking_writes; + } + object->access_tracking = access_tracking; + object->access_tracking_reads = 0; + object->access_tracking_writes = 0; + vm_object_unlock(object); + + if (access_tracking) { + vm_object_pmap_protect_options(object, + 0, + object->vo_size, + PMAP_NULL, + 0, + VM_PROT_NONE, + 0); + } +} +#endif /* VM_OBJECT_ACCESS_TRACKING */ + +void +vm_object_ledger_tag_ledgers( + vm_object_t object, + int *ledger_idx_volatile, + int *ledger_idx_nonvolatile, + int *ledger_idx_volatile_compressed, + int *ledger_idx_nonvolatile_compressed, + boolean_t *do_footprint) +{ + assert(object->shadow == VM_OBJECT_NULL); + + *do_footprint = !object->vo_no_footprint; + + switch (object->vo_ledger_tag) { + case VM_LEDGER_TAG_NONE: + /* + * Regular purgeable memory: + * counts in footprint only when nonvolatile. 
+ */ + *do_footprint = TRUE; + assert(object->purgable != VM_PURGABLE_DENY); + *ledger_idx_volatile = task_ledgers.purgeable_volatile; + *ledger_idx_nonvolatile = task_ledgers.purgeable_nonvolatile; + *ledger_idx_volatile_compressed = task_ledgers.purgeable_volatile_compressed; + *ledger_idx_nonvolatile_compressed = task_ledgers.purgeable_nonvolatile_compressed; + break; + case VM_LEDGER_TAG_DEFAULT: + /* + * "default" tagged memory: + * counts in footprint only when nonvolatile and not marked + * as "no_footprint". + */ + *ledger_idx_volatile = task_ledgers.tagged_nofootprint; + *ledger_idx_volatile_compressed = task_ledgers.tagged_nofootprint_compressed; + if (*do_footprint) { + *ledger_idx_nonvolatile = task_ledgers.tagged_footprint; + *ledger_idx_nonvolatile_compressed = task_ledgers.tagged_footprint_compressed; + } else { + *ledger_idx_nonvolatile = task_ledgers.tagged_nofootprint; + *ledger_idx_nonvolatile_compressed = task_ledgers.tagged_nofootprint_compressed; + } + break; + case VM_LEDGER_TAG_NETWORK: + /* + * "network" tagged memory: + * never counts in footprint. + */ + *do_footprint = FALSE; + *ledger_idx_volatile = task_ledgers.network_volatile; + *ledger_idx_volatile_compressed = task_ledgers.network_volatile_compressed; + *ledger_idx_nonvolatile = task_ledgers.network_nonvolatile; + *ledger_idx_nonvolatile_compressed = task_ledgers.network_nonvolatile_compressed; + break; + case VM_LEDGER_TAG_MEDIA: + /* + * "media" tagged memory: + * counts in footprint only when nonvolatile and not marked + * as "no footprint". + */ + *ledger_idx_volatile = task_ledgers.media_nofootprint; + *ledger_idx_volatile_compressed = task_ledgers.media_nofootprint_compressed; + if (*do_footprint) { + *ledger_idx_nonvolatile = task_ledgers.media_footprint; + *ledger_idx_nonvolatile_compressed = task_ledgers.media_footprint_compressed; + } else { + *ledger_idx_nonvolatile = task_ledgers.media_nofootprint; + *ledger_idx_nonvolatile_compressed = task_ledgers.media_nofootprint_compressed; + } + break; + case VM_LEDGER_TAG_GRAPHICS: + /* + * "graphics" tagged memory: + * counts in footprint only when nonvolatile and not marked + * as "no footprint". + */ + *ledger_idx_volatile = task_ledgers.graphics_nofootprint; + *ledger_idx_volatile_compressed = task_ledgers.graphics_nofootprint_compressed; + if (*do_footprint) { + *ledger_idx_nonvolatile = task_ledgers.graphics_footprint; + *ledger_idx_nonvolatile_compressed = task_ledgers.graphics_footprint_compressed; + } else { + *ledger_idx_nonvolatile = task_ledgers.graphics_nofootprint; + *ledger_idx_nonvolatile_compressed = task_ledgers.graphics_nofootprint_compressed; + } + break; + case VM_LEDGER_TAG_NEURAL: + /* + * "neural" tagged memory: + * counts in footprint only when nonvolatile and not marked + * as "no footprint". 
+ */ + *ledger_idx_volatile = task_ledgers.neural_nofootprint; + *ledger_idx_volatile_compressed = task_ledgers.neural_nofootprint_compressed; + if (*do_footprint) { + *ledger_idx_nonvolatile = task_ledgers.neural_footprint; + *ledger_idx_nonvolatile_compressed = task_ledgers.neural_footprint_compressed; + } else { + *ledger_idx_nonvolatile = task_ledgers.neural_nofootprint; + *ledger_idx_nonvolatile_compressed = task_ledgers.neural_nofootprint_compressed; + } + break; + default: + panic("%s: object %p has unsupported ledger_tag %d\n", + __FUNCTION__, object, object->vo_ledger_tag); + } +} + +kern_return_t +vm_object_ownership_change( + vm_object_t object, + int new_ledger_tag, + task_t new_owner, + int new_ledger_flags, + boolean_t old_task_objq_locked) +{ + int old_ledger_tag; + task_t old_owner; + int resident_count, wired_count; + unsigned int compressed_count; + int ledger_idx_volatile; + int ledger_idx_nonvolatile; + int ledger_idx_volatile_compressed; + int ledger_idx_nonvolatile_compressed; + int ledger_idx; + int ledger_idx_compressed; + boolean_t do_footprint, old_no_footprint, new_no_footprint; + boolean_t new_task_objq_locked; + + vm_object_lock_assert_exclusive(object); + + if (!object->internal) { + return KERN_INVALID_ARGUMENT; + } + if (new_ledger_tag == VM_LEDGER_TAG_NONE && + object->purgable == VM_PURGABLE_DENY) { + /* non-purgeable memory must have a valid non-zero ledger tag */ + return KERN_INVALID_ARGUMENT; + } + if (new_ledger_tag < 0 || + new_ledger_tag > VM_LEDGER_TAG_MAX) { + return KERN_INVALID_ARGUMENT; + } + if (new_ledger_flags & ~VM_LEDGER_FLAGS) { + return KERN_INVALID_ARGUMENT; + } + if (object->vo_ledger_tag == VM_LEDGER_TAG_NONE && + object->purgable == VM_PURGABLE_DENY) { + /* + * This VM object is neither ledger-tagged nor purgeable. + * We can convert it to "ledger tag" ownership iff it + * has not been used at all yet (no resident pages and + * no pager) and it's going to be assigned to a valid task. + */ + if (object->resident_page_count != 0 || + object->pager != NULL || + object->pager_created || + object->ref_count != 1 || + object->vo_owner != TASK_NULL || + object->copy_strategy != MEMORY_OBJECT_COPY_NONE || + new_owner == TASK_NULL) { + return KERN_FAILURE; + } + } + + if (new_ledger_flags & VM_LEDGER_FLAG_NO_FOOTPRINT) { + new_no_footprint = TRUE; + } else { + new_no_footprint = FALSE; + } +#if __arm64__ + if (!new_no_footprint && + object->purgable != VM_PURGABLE_DENY && + new_owner != TASK_NULL && + new_owner != VM_OBJECT_OWNER_DISOWNED && + new_owner->task_legacy_footprint) { + /* + * This task has been granted "legacy footprint" and should + * not be charged for its IOKit purgeable memory. Since we + * might now change the accounting of such memory to the + * "graphics" ledger, for example, give it the "no footprint" + * option. 
+ */ + new_no_footprint = TRUE; + } +#endif /* __arm64__ */ + assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE); + assert(object->shadow == VM_OBJECT_NULL); + assert(object->copy == VM_OBJECT_NULL); + + old_ledger_tag = object->vo_ledger_tag; + old_no_footprint = object->vo_no_footprint; + old_owner = VM_OBJECT_OWNER(object); + + DTRACE_VM7(object_ownership_change, + vm_object_t, object, + task_t, old_owner, + int, old_ledger_tag, + int, old_no_footprint, + task_t, new_owner, + int, new_ledger_tag, + int, new_no_footprint); + + assert(object->internal); + resident_count = object->resident_page_count - object->wired_page_count; + wired_count = object->wired_page_count; + compressed_count = vm_compressor_pager_get_count(object->pager); + + /* + * Deal with the old owner and/or ledger tag, if needed. + */ + if (old_owner != TASK_NULL && + ((old_owner != new_owner) /* new owner ... */ + || /* ... or ... */ + (old_no_footprint != new_no_footprint) /* new "no_footprint" */ + || /* ... or ... */ + old_ledger_tag != new_ledger_tag)) { /* ... new ledger */ + /* + * Take this object off of the old owner's ledgers. + */ + vm_object_ledger_tag_ledgers(object, + &ledger_idx_volatile, + &ledger_idx_nonvolatile, + &ledger_idx_volatile_compressed, + &ledger_idx_nonvolatile_compressed, + &do_footprint); + if (object->purgable == VM_PURGABLE_VOLATILE || + object->purgable == VM_PURGABLE_EMPTY) { + ledger_idx = ledger_idx_volatile; + ledger_idx_compressed = ledger_idx_volatile_compressed; + } else { + ledger_idx = ledger_idx_nonvolatile; + ledger_idx_compressed = ledger_idx_nonvolatile_compressed; + } + if (resident_count) { + /* + * Adjust the appropriate old owners's ledgers by the + * number of resident pages. + */ + ledger_debit(old_owner->ledger, + ledger_idx, + ptoa_64(resident_count)); + /* adjust old owner's footprint */ + if (do_footprint && + object->purgable != VM_PURGABLE_VOLATILE && + object->purgable != VM_PURGABLE_EMPTY) { + ledger_debit(old_owner->ledger, + task_ledgers.phys_footprint, + ptoa_64(resident_count)); + } + } + if (wired_count) { + /* wired pages are always nonvolatile */ + ledger_debit(old_owner->ledger, + ledger_idx_nonvolatile, + ptoa_64(wired_count)); + if (do_footprint) { + ledger_debit(old_owner->ledger, + task_ledgers.phys_footprint, + ptoa_64(wired_count)); + } + } + if (compressed_count) { + /* + * Adjust the appropriate old owner's ledgers + * by the number of compressed pages. + */ + ledger_debit(old_owner->ledger, + ledger_idx_compressed, + ptoa_64(compressed_count)); + if (do_footprint && + object->purgable != VM_PURGABLE_VOLATILE && + object->purgable != VM_PURGABLE_EMPTY) { + ledger_debit(old_owner->ledger, + task_ledgers.phys_footprint, + ptoa_64(compressed_count)); + } + } + if (old_owner != new_owner) { + /* remove object from old_owner's list of owned objects */ + DTRACE_VM2(object_owner_remove, + vm_object_t, object, + task_t, old_owner); + if (!old_task_objq_locked) { + task_objq_lock(old_owner); + } + old_owner->task_owned_objects--; + queue_remove(&old_owner->task_objq, object, + vm_object_t, task_objq); + switch (object->purgable) { + case VM_PURGABLE_NONVOLATILE: + case VM_PURGABLE_EMPTY: + vm_purgeable_nonvolatile_owner_update(old_owner, + -1); + break; + case VM_PURGABLE_VOLATILE: + vm_purgeable_volatile_owner_update(old_owner, + -1); + break; + default: + break; + } + if (!old_task_objq_locked) { + task_objq_unlock(old_owner); + } + } + } + + /* + * Switch to new ledger tag and/or owner. 
+ */ + + new_task_objq_locked = FALSE; + if (new_owner != old_owner && + new_owner != TASK_NULL && + new_owner != VM_OBJECT_OWNER_DISOWNED) { + /* + * If the new owner is not accepting new objects ("disowning"), + * the object becomes "disowned" and will be added to + * the kernel's task_objq. + * + * Check first without locking, to avoid blocking while the + * task is disowning its objects. + */ + if (new_owner->task_objects_disowning) { + new_owner = VM_OBJECT_OWNER_DISOWNED; + } else { + task_objq_lock(new_owner); + /* check again now that we have the lock */ + if (new_owner->task_objects_disowning) { + new_owner = VM_OBJECT_OWNER_DISOWNED; + task_objq_unlock(new_owner); + } else { + new_task_objq_locked = TRUE; + } + } + } + + object->vo_ledger_tag = new_ledger_tag; + object->vo_owner = new_owner; + object->vo_no_footprint = new_no_footprint; + + if (new_owner == VM_OBJECT_OWNER_DISOWNED) { + /* + * Disowned objects are added to the kernel's task_objq but + * are marked as owned by "VM_OBJECT_OWNER_DISOWNED" to + * differentiate them from objects intentionally owned by + * the kernel. + */ + assert(old_owner != kernel_task); + new_owner = kernel_task; + assert(!new_task_objq_locked); + task_objq_lock(new_owner); + new_task_objq_locked = TRUE; + } + + /* + * Deal with the new owner and/or ledger tag, if needed. + */ + if (new_owner != TASK_NULL && + ((new_owner != old_owner) /* new owner ... */ + || /* ... or ... */ + (new_no_footprint != old_no_footprint) /* ... new "no_footprint" */ + || /* ... or ... */ + new_ledger_tag != old_ledger_tag)) { /* ... new ledger */ + /* + * Add this object to the new owner's ledgers. + */ + vm_object_ledger_tag_ledgers(object, + &ledger_idx_volatile, + &ledger_idx_nonvolatile, + &ledger_idx_volatile_compressed, + &ledger_idx_nonvolatile_compressed, + &do_footprint); + if (object->purgable == VM_PURGABLE_VOLATILE || + object->purgable == VM_PURGABLE_EMPTY) { + ledger_idx = ledger_idx_volatile; + ledger_idx_compressed = ledger_idx_volatile_compressed; + } else { + ledger_idx = ledger_idx_nonvolatile; + ledger_idx_compressed = ledger_idx_nonvolatile_compressed; + } + if (resident_count) { + /* + * Adjust the appropriate new owners's ledgers by the + * number of resident pages. + */ + ledger_credit(new_owner->ledger, + ledger_idx, + ptoa_64(resident_count)); + /* adjust new owner's footprint */ + if (do_footprint && + object->purgable != VM_PURGABLE_VOLATILE && + object->purgable != VM_PURGABLE_EMPTY) { + ledger_credit(new_owner->ledger, + task_ledgers.phys_footprint, + ptoa_64(resident_count)); + } + } + if (wired_count) { + /* wired pages are always nonvolatile */ + ledger_credit(new_owner->ledger, + ledger_idx_nonvolatile, + ptoa_64(wired_count)); + if (do_footprint) { + ledger_credit(new_owner->ledger, + task_ledgers.phys_footprint, + ptoa_64(wired_count)); + } + } + if (compressed_count) { + /* + * Adjust the new owner's ledgers by the number of + * compressed pages. 
+ */ + ledger_credit(new_owner->ledger, + ledger_idx_compressed, + ptoa_64(compressed_count)); + if (do_footprint && + object->purgable != VM_PURGABLE_VOLATILE && + object->purgable != VM_PURGABLE_EMPTY) { + ledger_credit(new_owner->ledger, + task_ledgers.phys_footprint, + ptoa_64(compressed_count)); + } + } + if (new_owner != old_owner) { + /* add object to new_owner's list of owned objects */ + DTRACE_VM2(object_owner_add, + vm_object_t, object, + task_t, new_owner); + assert(new_task_objq_locked); + new_owner->task_owned_objects++; + queue_enter(&new_owner->task_objq, object, + vm_object_t, task_objq); + switch (object->purgable) { + case VM_PURGABLE_NONVOLATILE: + case VM_PURGABLE_EMPTY: + vm_purgeable_nonvolatile_owner_update(new_owner, + +1); + break; + case VM_PURGABLE_VOLATILE: + vm_purgeable_volatile_owner_update(new_owner, + +1); + break; + default: + break; + } + } + } + + if (new_task_objq_locked) { + task_objq_unlock(new_owner); + } + + return KERN_SUCCESS; +} + +void +vm_owned_objects_disown( + task_t task) +{ + vm_object_t next_object; + vm_object_t object; + int collisions; + kern_return_t kr; + + if (task == NULL) { + return; + } + + collisions = 0; + +again: + if (task->task_objects_disowned) { + /* task has already disowned its owned objects */ + assert(task->task_volatile_objects == 0); + assert(task->task_nonvolatile_objects == 0); + assert(task->task_owned_objects == 0); + return; + } + + task_objq_lock(task); + + task->task_objects_disowning = TRUE; + + for (object = (vm_object_t) queue_first(&task->task_objq); + !queue_end(&task->task_objq, (queue_entry_t) object); + object = next_object) { + if (task->task_nonvolatile_objects == 0 && + task->task_volatile_objects == 0 && + task->task_owned_objects == 0) { + /* no more objects owned by "task" */ + break; + } + + next_object = (vm_object_t) queue_next(&object->task_objq); + +#if DEBUG + assert(object->vo_purgeable_volatilizer == NULL); +#endif /* DEBUG */ + assert(object->vo_owner == task); + if (!vm_object_lock_try(object)) { + task_objq_unlock(task); + mutex_pause(collisions++); + goto again; + } + /* transfer ownership to the kernel */ + assert(VM_OBJECT_OWNER(object) != kernel_task); + kr = vm_object_ownership_change( + object, + object->vo_ledger_tag, /* unchanged */ + VM_OBJECT_OWNER_DISOWNED, /* new owner */ + 0, /* new_ledger_flags */ + TRUE); /* old_owner->task_objq locked */ + assert(kr == KERN_SUCCESS); + assert(object->vo_owner == VM_OBJECT_OWNER_DISOWNED); + vm_object_unlock(object); + } + + if (__improbable(task->task_volatile_objects != 0 || + task->task_nonvolatile_objects != 0 || + task->task_owned_objects != 0)) { + panic("%s(%p): volatile=%d nonvolatile=%d owned=%d q=%p q_first=%p q_last=%p", + __FUNCTION__, + task, + task->task_volatile_objects, + task->task_nonvolatile_objects, + task->task_owned_objects, + &task->task_objq, + queue_first(&task->task_objq), + queue_last(&task->task_objq)); + } + + /* there shouldn't be any objects owned by task now */ + assert(task->task_volatile_objects == 0); + assert(task->task_nonvolatile_objects == 0); + assert(task->task_owned_objects == 0); + assert(task->task_objects_disowning); + + /* and we don't need to try and disown again */ + task->task_objects_disowned = TRUE; + + task_objq_unlock(task); }
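
The vm_object_transpose() changes above rely on two small patterns: taking the two object locks in a fixed (address) order so that concurrent transposes cannot deadlock, and exchanging each per-object field through a temporary, which is what the __TRANSPOSE_FIELD() macro does. The stand-alone sketch below illustrates just those two patterns under that reading of the diff; it is not XNU code, and every name in it (toy_object, TOY_TRANSPOSE_FIELD, toy_transpose) is hypothetical.

/*
 * Minimal userspace sketch (not XNU code) of the lock-ordering and
 * field-transpose patterns used by the new vm_object_transpose().
 * All identifiers here are illustrative stand-ins.
 */
#include <pthread.h>
#include <stdio.h>

struct toy_object {
	pthread_mutex_t lock;
	unsigned int    wimg_bits;   /* stands in for one transposed field */
	unsigned long   last_alloc;  /* and another */
};

/* Three-way swap through a temporary, like __TRANSPOSE_FIELD(). */
#define TOY_TRANSPOSE_FIELD(tmp, o1, o2, field)  \
	do {                                     \
		(tmp)->field = (o1)->field;      \
		(o1)->field  = (o2)->field;      \
		(o2)->field  = (tmp)->field;     \
	} while (0)

static void
toy_transpose(struct toy_object *o1, struct toy_object *o2)
{
	struct toy_object tmp;

	/*
	 * Always lock in ascending address order, mirroring the
	 * "if (object1 > object2)" swap in the kernel routine, so two
	 * racing transposes cannot deadlock on each other's locks.
	 */
	if (o1 > o2) {
		struct toy_object *swap = o1;
		o1 = o2;
		o2 = swap;
	}
	pthread_mutex_lock(&o1->lock);
	pthread_mutex_lock(&o2->lock);

	TOY_TRANSPOSE_FIELD(&tmp, o1, o2, wimg_bits);
	TOY_TRANSPOSE_FIELD(&tmp, o1, o2, last_alloc);

	pthread_mutex_unlock(&o2->lock);
	pthread_mutex_unlock(&o1->lock);
}

int
main(void)
{
	struct toy_object a = { PTHREAD_MUTEX_INITIALIZER, 1, 100 };
	struct toy_object b = { PTHREAD_MUTEX_INITIALIZER, 2, 200 };

	toy_transpose(&a, &b);
	printf("a: %u/%lu  b: %u/%lu\n", a.wimg_bits, a.last_alloc,
	    b.wimg_bits, b.last_alloc);
	return 0;
}

Locking by ascending address is the same convention the kernel routine adopts before it starts moving pages or swapping fields between the two objects.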