X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/d7e50217d7adf6e52786a38bcaa4cd698cb9a79e..91447636331957f3d9b5ca5b508f07c526b0074d:/osfmk/vm/vm_pageout.c

diff --git a/osfmk/vm/vm_pageout.c b/osfmk/vm/vm_pageout.c
index ab64d42ff..d75ec79de 100644
--- a/osfmk/vm/vm_pageout.c
+++ b/osfmk/vm/vm_pageout.c
@@ -1,24 +1,21 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
- * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
  *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
  *
  * @APPLE_LICENSE_HEADER_END@
  */
@@ -60,6 +57,9 @@
  * The proverbial page-out daemon.
*/ +#include + +#include #include #include #include @@ -70,34 +70,60 @@ #include #include #include +#include +#include #include #include -#include + +#include #include +#include +#include +#include #include #include +#include + +#include + #include #include #include #include #include #include -#include -#include +#include /* must be last */ + +/* + * ENCRYPTED SWAP: + */ +#ifdef __ppc__ +#include +#endif /* __ppc__ */ +#include <../bsd/crypto/aes/aes.h> extern ipc_port_t memory_manager_default; -#ifndef VM_PAGE_LAUNDRY_MAX -#define VM_PAGE_LAUNDRY_MAX 6 /* outstanding DMM page cleans */ -#endif /* VM_PAGEOUT_LAUNDRY_MAX */ -#ifndef VM_PAGEOUT_BURST_MAX -#define VM_PAGEOUT_BURST_MAX 32 /* simultaneous EMM page cleans */ -#endif /* VM_PAGEOUT_BURST_MAX */ +#ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE +#define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 10000 /* maximum iterations of the active queue to move pages to inactive */ +#endif + +#ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE +#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096 /* maximum iterations of the inactive queue w/o stealing/cleaning a page */ +#endif + +#ifndef VM_PAGEOUT_DEADLOCK_RELIEF +#define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */ +#endif + +#ifndef VM_PAGEOUT_INACTIVE_RELIEF +#define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */ +#endif -#ifndef VM_PAGEOUT_DISCARD_MAX -#define VM_PAGEOUT_DISCARD_MAX 68 /* simultaneous EMM page cleans */ -#endif /* VM_PAGEOUT_DISCARD_MAX */ +#ifndef VM_PAGE_LAUNDRY_MAX +#define VM_PAGE_LAUNDRY_MAX 16UL /* maximum pageouts on a given pageout queue */ +#endif /* VM_PAGEOUT_LAUNDRY_MAX */ #ifndef VM_PAGEOUT_BURST_WAIT #define VM_PAGEOUT_BURST_WAIT 30 /* milliseconds per page */ @@ -107,6 +133,15 @@ extern ipc_port_t memory_manager_default; #define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */ #endif /* VM_PAGEOUT_EMPTY_WAIT */ +#ifndef VM_PAGEOUT_DEADLOCK_WAIT +#define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */ +#endif /* VM_PAGEOUT_DEADLOCK_WAIT */ + +#ifndef VM_PAGEOUT_IDLE_WAIT +#define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */ +#endif /* VM_PAGEOUT_IDLE_WAIT */ + + /* * To obtain a reasonable LRU approximation, the inactive queue * needs to be large enough to give pages on it a chance to be @@ -150,10 +185,30 @@ extern ipc_port_t memory_manager_default; */ #ifndef VM_PAGE_FREE_RESERVED -#define VM_PAGE_FREE_RESERVED \ - ((16 * VM_PAGE_LAUNDRY_MAX) + NCPUS) +#define VM_PAGE_FREE_RESERVED(n) \ + ((6 * VM_PAGE_LAUNDRY_MAX) + (n)) #endif /* VM_PAGE_FREE_RESERVED */ + +/* + * must hold the page queues lock to + * manipulate this structure + */ +struct vm_pageout_queue { + queue_head_t pgo_pending; /* laundry pages to be processed by pager's iothread */ + unsigned int pgo_laundry; /* current count of laundry pages on queue or in flight */ + unsigned int pgo_maxlaundry; + + unsigned int pgo_idle:1, /* iothread is blocked waiting for work to do */ + pgo_busy:1, /* iothread is currently processing request from pgo_pending */ + pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */ + :0; +}; + +#define VM_PAGE_Q_THROTTLED(q) \ + ((q)->pgo_laundry >= (q)->pgo_maxlaundry) + + /* * Exported variable used to broadcast the activation of the pageout scan * Working Set uses this to throttle its use of pmap removes. In this @@ -166,26 +221,27 @@ unsigned int vm_pageout_scan_event_counter = 0; /* * Forward declarations for internal routines. 
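
For orientation, a minimal sketch of how one of the pageout queues introduced above could be brought to its idle state and consulted, using only the fields, macros and defaults defined in this hunk; the helper name example_pageout_queue_init() is illustrative and not part of this patch.

/*
 * Illustrative sketch: put a vm_pageout_queue into its idle state and
 * show the throttle test a producer performs before queueing laundry.
 */
static void
example_pageout_queue_init(struct vm_pageout_queue *q)
{
	queue_init(&q->pgo_pending);			/* no laundry queued yet */
	q->pgo_laundry = 0;				/* nothing in flight */
	q->pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;	/* 16 pages, per the default above */
	q->pgo_idle = FALSE;
	q->pgo_busy = FALSE;
	q->pgo_throttled = FALSE;
}

/*
 * A producer holding the page queues lock backs off once the queue fills:
 *
 *	if (VM_PAGE_Q_THROTTLED(q))	// pgo_laundry >= pgo_maxlaundry
 *		wait for vm_pageout_throttle_up() to drain it
 */
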
*/ + +static void vm_pageout_garbage_collect(int); +static void vm_pageout_iothread_continue(struct vm_pageout_queue *); +static void vm_pageout_iothread_external(void); +static void vm_pageout_iothread_internal(void); +static void vm_pageout_queue_steal(vm_page_t); + extern void vm_pageout_continue(void); extern void vm_pageout_scan(void); -extern void vm_pageout_throttle(vm_page_t m); -extern vm_page_t vm_pageout_cluster_page( - vm_object_t object, - vm_object_offset_t offset, - boolean_t precious_clean); unsigned int vm_pageout_reserved_internal = 0; unsigned int vm_pageout_reserved_really = 0; -unsigned int vm_page_laundry_max = 0; /* # of clusters outstanding */ -unsigned int vm_page_laundry_min = 0; -unsigned int vm_pageout_burst_max = 0; -unsigned int vm_pageout_burst_wait = 0; /* milliseconds per page */ +unsigned int vm_pageout_idle_wait = 0; /* milliseconds */ unsigned int vm_pageout_empty_wait = 0; /* milliseconds */ -unsigned int vm_pageout_burst_min = 0; -unsigned int vm_pageout_pause_count = 0; -unsigned int vm_pageout_pause_max = 0; -unsigned int vm_free_page_pause = 100; /* milliseconds */ +unsigned int vm_pageout_burst_wait = 0; /* milliseconds */ +unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */ +unsigned int vm_pageout_deadlock_relief = 0; +unsigned int vm_pageout_inactive_relief = 0; +unsigned int vm_pageout_burst_active_throttle = 0; +unsigned int vm_pageout_burst_inactive_throttle = 0; /* * Protection against zero fill flushing live working sets derived @@ -215,20 +271,66 @@ unsigned int vm_pageout_inactive_used = 0; /* debugging */ unsigned int vm_pageout_inactive_clean = 0; /* debugging */ unsigned int vm_pageout_inactive_dirty = 0; /* debugging */ unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */ +unsigned int vm_pageout_purged_objects = 0; /* debugging */ unsigned int vm_stat_discard = 0; /* debugging */ unsigned int vm_stat_discard_sent = 0; /* debugging */ unsigned int vm_stat_discard_failure = 0; /* debugging */ unsigned int vm_stat_discard_throttle = 0; /* debugging */ -unsigned int vm_pageout_scan_active_emm_throttle = 0; /* debugging */ -unsigned int vm_pageout_scan_active_emm_throttle_success = 0; /* debugging */ -unsigned int vm_pageout_scan_active_emm_throttle_failure = 0; /* debugging */ -unsigned int vm_pageout_scan_inactive_emm_throttle = 0; /* debugging */ -unsigned int vm_pageout_scan_inactive_emm_throttle_success = 0; /* debugging */ -unsigned int vm_pageout_scan_inactive_emm_throttle_failure = 0; /* debugging */ +unsigned int vm_pageout_scan_active_throttled = 0; +unsigned int vm_pageout_scan_inactive_throttled = 0; +unsigned int vm_pageout_scan_throttle = 0; /* debugging */ +unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */ +unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */ +unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */ +unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */ +unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */ +/* + * Backing store throttle when BS is exhausted + */ +unsigned int vm_backing_store_low = 0; unsigned int vm_pageout_out_of_line = 0; unsigned int vm_pageout_in_place = 0; + +/* + * ENCRYPTED SWAP: + * counters and statistics... 
+ */ +unsigned long vm_page_decrypt_counter = 0; +unsigned long vm_page_decrypt_for_upl_counter = 0; +unsigned long vm_page_encrypt_counter = 0; +unsigned long vm_page_encrypt_abort_counter = 0; +unsigned long vm_page_encrypt_already_encrypted_counter = 0; +boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */ + + +struct vm_pageout_queue vm_pageout_queue_internal; +struct vm_pageout_queue vm_pageout_queue_external; + + +/* + * Routine: vm_backing_store_disable + * Purpose: + * Suspend non-privileged threads wishing to extend + * backing store when we are low on backing store + * (Synchronized by caller) + */ +void +vm_backing_store_disable( + boolean_t disable) +{ + if(disable) { + vm_backing_store_low = 1; + } else { + if(vm_backing_store_low) { + vm_backing_store_low = 0; + thread_wakeup((event_t) &vm_backing_store_low); + } + } +} + + /* * Routine: vm_pageout_object_allocate * Purpose: @@ -252,9 +354,6 @@ vm_pageout_object_allocate( assert(object->pager_ready); - if (object->pager_trusted || object->internal) - vm_pageout_throttle(m); - new_object = vm_object_allocate(size); if (object->pager_trusted) { @@ -274,6 +373,8 @@ vm_pageout_object_allocate( */ vm_object_lock(object); vm_object_paging_begin(object); + vm_page_lock_queues(); + vm_page_unlock_queues(); vm_object_unlock(object); vm_pageout_in_place++; @@ -311,6 +412,7 @@ vm_pageout_object_terminate( vm_object_t object) { vm_object_t shadow_object; + boolean_t shadow_internal; /* * Deal with the deallocation (last reference) of a pageout object @@ -321,6 +423,7 @@ vm_pageout_object_terminate( assert(object->pageout); shadow_object = object->shadow; vm_object_lock(shadow_object); + shadow_internal = shadow_object->internal; while (!queue_empty(&object->memq)) { vm_page_t p, m; @@ -360,15 +463,11 @@ vm_pageout_object_terminate( /* * Handle the trusted pager throttle. + * Also decrement the burst throttle (if external). */ vm_page_lock_queues(); if (m->laundry) { - vm_page_laundry_count--; - m->laundry = FALSE; - if (vm_page_laundry_count < vm_page_laundry_min) { - vm_page_laundry_min = 0; - thread_wakeup((event_t) &vm_page_laundry_count); - } + vm_pageout_throttle_up(m); } /* @@ -390,17 +489,17 @@ vm_pageout_object_terminate( /* * Revoke all access to the page. Since the object is * locked, and the page is busy, this prevents the page - * from being dirtied after the pmap_is_modified() call + * from being dirtied after the pmap_disconnect() call * returns. - */ - pmap_page_protect(m->phys_page, VM_PROT_NONE); - - /* + * * Since the page is left "dirty" but "not modifed", we * can detect whether the page was redirtied during * pageout by checking the modify state. */ - m->dirty = pmap_is_modified(m->phys_page); + if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED) + m->dirty = TRUE; + else + m->dirty = FALSE; if (m->dirty) { CLUSTER_STAT(vm_pageout_target_page_dirtied++;) @@ -433,7 +532,7 @@ vm_pageout_object_terminate( /* We do not re-set m->dirty ! */ /* The page was busy so no extraneous activity */ - /* could have occured. COPY_INTO is a read into the */ + /* could have occurred. COPY_INTO is a read into the */ /* new pages. CLEAN_IN_PLACE does actually write */ /* out the pages but handling outside of this code */ /* will take care of resetting dirty. 
We clear the */ @@ -532,7 +631,6 @@ vm_pageout_setup( vm_object_offset_t offset; register vm_page_t holding_page; register vm_page_t new_m; - register vm_page_t new_page; boolean_t need_to_wire = FALSE; @@ -779,8 +877,6 @@ void vm_pageout_initialize_page( vm_page_t m) { - vm_map_copy_t copy; - vm_object_t new_object; vm_object_t object; vm_object_offset_t paging_offset; vm_page_t holding_page; @@ -804,28 +900,26 @@ vm_pageout_initialize_page( object = m->object; paging_offset = m->offset + object->paging_offset; vm_object_paging_begin(object); - vm_object_unlock(object); if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) { VM_PAGE_FREE(m); panic("reservation without pageout?"); /* alan */ + vm_object_unlock(object); return; } /* set the page for future call to vm_fault_list_request */ holding_page = NULL; - vm_object_lock(m->object); vm_page_lock_queues(); pmap_clear_modify(m->phys_page); m->dirty = TRUE; - m->busy = TRUE; - m->list_req_pending = TRUE; - m->cleaning = TRUE; + m->busy = TRUE; + m->list_req_pending = TRUE; + m->cleaning = TRUE; m->pageout = TRUE; vm_page_wire(m); vm_page_unlock_queues(); - vm_object_unlock(m->object); - vm_pageout_throttle(m); + vm_object_unlock(object); /* * Write the data to its pager. @@ -856,331 +950,147 @@ boolean_t allow_clustered_pageouts = FALSE; /* * vm_pageout_cluster: * - * Given a page, page it out, and attempt to clean adjacent pages + * Given a page, queue it to the appropriate I/O thread, + * which will page it out and attempt to clean adjacent pages * in the same operation. * - * The page must be busy, and the object unlocked w/ paging reference - * to prevent deallocation or collapse. The page must not be on any - * pageout queue. + * The page must be busy, and the object and queues locked. We will take a + * paging reference to prevent deallocation or collapse when we + * release the object lock back at the call site. The I/O thread + * is responsible for consuming this reference + * + * The page must not be on any pageout queue. */ + void -vm_pageout_cluster( - vm_page_t m) +vm_pageout_cluster(vm_page_t m) { vm_object_t object = m->object; - vm_object_offset_t offset = m->offset; /* from vm_object start */ - vm_object_offset_t paging_offset = m->offset + object->paging_offset; - vm_object_t new_object; - vm_object_offset_t new_offset; - vm_size_t cluster_size; - vm_object_offset_t cluster_offset; /* from memory_object start */ - vm_object_offset_t cluster_lower_bound; /* from vm_object_start */ - vm_object_offset_t cluster_upper_bound; /* from vm_object_start */ - vm_object_offset_t cluster_start, cluster_end;/* from vm_object start */ - vm_object_offset_t offset_within_cluster; - vm_size_t length_of_data; - vm_page_t friend, holding_page; - kern_return_t rc; - boolean_t precious_clean = TRUE; - int pages_in_cluster; - - CLUSTER_STAT(int pages_at_higher_offsets = 0;) - CLUSTER_STAT(int pages_at_lower_offsets = 0;) + struct vm_pageout_queue *q; + XPR(XPR_VM_PAGEOUT, "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n", - (integer_t)object, offset, (integer_t)m, 0, 0); + (integer_t)object, m->offset, (integer_t)m, 0, 0); - CLUSTER_STAT(vm_pageout_cluster_clusters++;) /* * Only a certain kind of page is appreciated here. 
*/ assert(m->busy && (m->dirty || m->precious) && (m->wire_count == 0)); assert(!m->cleaning && !m->pageout && !m->inactive && !m->active); - vm_object_lock(object); - cluster_size = object->cluster_size; - - assert(cluster_size >= PAGE_SIZE); - if (cluster_size < PAGE_SIZE) cluster_size = PAGE_SIZE; - assert(object->pager_created && object->pager_initialized); - assert(object->internal || object->pager_ready); - - if (m->precious && !m->dirty) - precious_clean = TRUE; - - if (!object->pager_trusted || !allow_clustered_pageouts) - cluster_size = PAGE_SIZE; - vm_object_unlock(object); - - cluster_offset = paging_offset & (vm_object_offset_t)(cluster_size - 1); - /* bytes from beginning of cluster */ - /* - * Due to unaligned mappings, we have to be careful - * of negative offsets into the VM object. Clip the cluster - * boundary to the VM object, not the memory object. - */ - if (offset > cluster_offset) { - cluster_lower_bound = offset - cluster_offset; - /* from vm_object */ - } else { - cluster_lower_bound = 0; - } - cluster_upper_bound = (offset - cluster_offset) + - (vm_object_offset_t)cluster_size; - - /* set the page for future call to vm_fault_list_request */ - holding_page = NULL; - vm_object_lock(m->object); - vm_page_lock_queues(); - m->busy = TRUE; - m->list_req_pending = TRUE; - m->cleaning = TRUE; - m->pageout = TRUE; - vm_page_wire(m); - vm_page_unlock_queues(); - vm_object_unlock(m->object); - vm_pageout_throttle(m); - - /* - * Search backward for adjacent eligible pages to clean in - * this operation. - */ - - cluster_start = offset; - if (offset) { /* avoid wrap-around at zero */ - for (cluster_start = offset - PAGE_SIZE_64; - cluster_start >= cluster_lower_bound; - cluster_start -= PAGE_SIZE_64) { - assert(cluster_size > PAGE_SIZE); - - vm_object_lock(object); - vm_page_lock_queues(); - - if ((friend = vm_pageout_cluster_page(object, cluster_start, - precious_clean)) == VM_PAGE_NULL) { - vm_page_unlock_queues(); - vm_object_unlock(object); - break; - } - new_offset = (cluster_start + object->paging_offset) - & (cluster_size - 1); - - assert(new_offset < cluster_offset); - m->list_req_pending = TRUE; - m->cleaning = TRUE; -/* do nothing except advance the write request, all we really need to */ -/* do is push the target page and let the code at the other end decide */ -/* what is really the right size */ - if (vm_page_free_count <= vm_page_free_reserved) { - m->busy = TRUE; - m->pageout = TRUE; - vm_page_wire(m); - } - - vm_page_unlock_queues(); - vm_object_unlock(object); - if(m->dirty || m->object->internal) { - CLUSTER_STAT(pages_at_lower_offsets++;) - } - - } - cluster_start += PAGE_SIZE_64; - } - assert(cluster_start >= cluster_lower_bound); - assert(cluster_start <= offset); - /* - * Search forward for adjacent eligible pages to clean in - * this operation. 
- */ - for (cluster_end = offset + PAGE_SIZE_64; - cluster_end < cluster_upper_bound; - cluster_end += PAGE_SIZE_64) { - assert(cluster_size > PAGE_SIZE); - - vm_object_lock(object); - vm_page_lock_queues(); - - if ((friend = vm_pageout_cluster_page(object, cluster_end, - precious_clean)) == VM_PAGE_NULL) { - vm_page_unlock_queues(); - vm_object_unlock(object); - break; - } - new_offset = (cluster_end + object->paging_offset) - & (cluster_size - 1); - - assert(new_offset < cluster_size); - m->list_req_pending = TRUE; - m->cleaning = TRUE; -/* do nothing except advance the write request, all we really need to */ -/* do is push the target page and let the code at the other end decide */ -/* what is really the right size */ - if (vm_page_free_count <= vm_page_free_reserved) { - m->busy = TRUE; - m->pageout = TRUE; - vm_page_wire(m); - } - - vm_page_unlock_queues(); - vm_object_unlock(object); - - if(m->dirty || m->object->internal) { - CLUSTER_STAT(pages_at_higher_offsets++;) - } - } - assert(cluster_end <= cluster_upper_bound); - assert(cluster_end >= offset + PAGE_SIZE); - /* - * (offset - cluster_offset) is beginning of cluster_object - * relative to vm_object start. + * protect the object from collapse - + * locking in the object's paging_offset. */ - offset_within_cluster = cluster_start - (offset - cluster_offset); - length_of_data = cluster_end - cluster_start; - - assert(offset_within_cluster < cluster_size); - assert((offset_within_cluster + length_of_data) <= cluster_size); - - rc = KERN_SUCCESS; - assert(rc == KERN_SUCCESS); - - pages_in_cluster = length_of_data/PAGE_SIZE; - -#if MACH_CLUSTER_STATS - (cluster_stats[pages_at_lower_offsets].pages_at_lower_offsets)++; - (cluster_stats[pages_at_higher_offsets].pages_at_higher_offsets)++; - (cluster_stats[pages_in_cluster].pages_in_cluster)++; -#endif /* MACH_CLUSTER_STATS */ + vm_object_paging_begin(object); /* - * Send the data to the pager. + * set the page for future call to vm_fault_list_request + * page should already be marked busy */ - paging_offset = cluster_start + object->paging_offset; - - rc = memory_object_data_return(object->pager, - paging_offset, - length_of_data, - !precious_clean, - FALSE); + vm_page_wire(m); + m->list_req_pending = TRUE; + m->cleaning = TRUE; + m->pageout = TRUE; + m->laundry = TRUE; - vm_object_lock(object); - vm_object_paging_end(object); + if (object->internal == TRUE) + q = &vm_pageout_queue_internal; + else + q = &vm_pageout_queue_external; + q->pgo_laundry++; - if (holding_page) { - assert(!object->pager_trusted); - VM_PAGE_FREE(holding_page); - vm_object_paging_end(object); + m->pageout_queue = TRUE; + queue_enter(&q->pgo_pending, m, vm_page_t, pageq); + + if (q->pgo_idle == TRUE) { + q->pgo_idle = FALSE; + thread_wakeup((event_t) &q->pgo_pending); } - - vm_object_unlock(object); } -/* - * Trusted pager throttle. - * Object must be unlocked, page queues must be unlocked. - */ -void -vm_pageout_throttle( - register vm_page_t m) -{ - vm_page_lock_queues(); - assert(!m->laundry); - m->laundry = TRUE; - while (vm_page_laundry_count >= vm_page_laundry_max) { - /* - * Set the threshold for when vm_page_free() - * should wake us up. - */ - vm_page_laundry_min = vm_page_laundry_max/2; - - assert_wait((event_t) &vm_page_laundry_count, THREAD_UNINT); - vm_page_unlock_queues(); - /* - * Pause to let the default pager catch up. 
- */ - thread_block((void (*)(void)) 0); - vm_page_lock_queues(); - } - vm_page_laundry_count++; - vm_page_unlock_queues(); -} +unsigned long vm_pageout_throttle_up_count = 0; /* - * The global variable vm_pageout_clean_active_pages controls whether - * active pages are considered valid to be cleaned in place during a - * clustered pageout. Performance measurements are necessary to determine - * the best policy. - */ -int vm_pageout_clean_active_pages = 1; -/* - * vm_pageout_cluster_page: [Internal] - * - * return a vm_page_t to the page at (object,offset) if it is appropriate - * to clean in place. Pages that are non-existent, busy, absent, already - * cleaning, or not dirty are not eligible to be cleaned as an adjacent - * page in a cluster. + * A page is back from laundry. See if there are some pages waiting to + * go to laundry and if we can let some of them go now. * - * The object must be locked on entry, and remains locked throughout - * this call. + * Object and page queues must be locked. */ - -vm_page_t -vm_pageout_cluster_page( - vm_object_t object, - vm_object_offset_t offset, - boolean_t precious_clean) +void +vm_pageout_throttle_up( + vm_page_t m) { - vm_page_t m; - - XPR(XPR_VM_PAGEOUT, - "vm_pageout_cluster_page, object 0x%X offset 0x%X\n", - (integer_t)object, offset, 0, 0, 0); - - if ((m = vm_page_lookup(object, offset)) == VM_PAGE_NULL) - return(VM_PAGE_NULL); + struct vm_pageout_queue *q; - if (m->busy || m->absent || m->cleaning || - (m->wire_count != 0) || m->error) - return(VM_PAGE_NULL); + vm_pageout_throttle_up_count++; - if (vm_pageout_clean_active_pages) { - if (!m->active && !m->inactive) return(VM_PAGE_NULL); - } else { - if (!m->inactive) return(VM_PAGE_NULL); - } + assert(m->laundry); + assert(m->object != VM_OBJECT_NULL); + assert(m->object != kernel_object); - assert(!m->private); - assert(!m->fictitious); + if (m->object->internal == TRUE) + q = &vm_pageout_queue_internal; + else + q = &vm_pageout_queue_external; - if (!m->dirty) m->dirty = pmap_is_modified(m->phys_page); + m->laundry = FALSE; + q->pgo_laundry--; - if (precious_clean) { - if (!m->precious || !m->dirty) - return(VM_PAGE_NULL); - } else { - if (!m->dirty) - return(VM_PAGE_NULL); + if (q->pgo_throttled == TRUE) { + q->pgo_throttled = FALSE; + thread_wakeup((event_t) &q->pgo_laundry); } - return(m); } + /* * vm_pageout_scan does the dirty work for the pageout daemon. * It returns with vm_page_queue_free_lock held and * vm_page_free_wanted == 0. 
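
The queueing in vm_pageout_cluster() and the completion in vm_pageout_throttle_up() above form a simple credit scheme: each page queued charges one unit of pgo_laundry, and each finished pageout refunds it and may wake a throttled vm_pageout_scan(). A hedged sketch of the refund side follows; example_laundry_done() is a hypothetical stand-in for whatever code observes the pageout completing, and per the comment above both the object and the page queues must be locked.

/*
 * Sketch of the laundry-credit refund path (illustrative only).
 */
static void
example_laundry_done(vm_page_t m)
{
	vm_object_lock(m->object);
	vm_page_lock_queues();

	vm_pageout_throttle_up(m);	/* m->laundry = FALSE, pgo_laundry--,
					 * wakeup if the scan thread throttled */

	vm_page_unlock_queues();
	vm_object_unlock(m->object);
}
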
*/ -extern void vm_pageout_scan_continue(void); /* forward; */ + +#define DELAYED_UNLOCK_LIMIT (3 * MAX_UPL_TRANSFER) + +#define FCS_IDLE 0 +#define FCS_DELAYED 1 +#define FCS_DEADLOCK_DETECTED 2 + +struct flow_control { + int state; + mach_timespec_t ts; +}; + +extern kern_return_t sysclk_gettime(mach_timespec_t *); + void vm_pageout_scan(void) { - unsigned int burst_count; - boolean_t now = FALSE; - unsigned int laundry_pages; - boolean_t need_more_inactive_pages; - unsigned int loop_detect; + unsigned int loop_count = 0; + unsigned int inactive_burst_count = 0; + unsigned int active_burst_count = 0; + vm_page_t local_freeq = 0; + int local_freed = 0; + int delayed_unlock = 0; + int need_internal_inactive = 0; + int refmod_state = 0; + int vm_pageout_deadlock_target = 0; + struct vm_pageout_queue *iq; + struct vm_pageout_queue *eq; + struct flow_control flow_control; + boolean_t active_throttled = FALSE; + boolean_t inactive_throttled = FALSE; + mach_timespec_t ts; + unsigned int msecs = 0; + vm_object_t object; + + + flow_control.state = FCS_IDLE; + iq = &vm_pageout_queue_internal; + eq = &vm_pageout_queue_external; XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0); @@ -1207,145 +1117,140 @@ vm_pageout_scan(void) * When memory is very tight, we can't rely on external pagers to * clean pages. They probably aren't running, because they * aren't vm-privileged. If we kept sending dirty pages to them, - * we could exhaust the free list. However, we can't just ignore - * pages belonging to external objects, because there might be no - * pages belonging to internal objects. Hence, we get the page - * into an internal object and then immediately double-page it, - * sending it to the default pager. - * - * consider_zone_gc should be last, because the other operations - * might return memory to zones. + * we could exhaust the free list. */ + vm_page_lock_queues(); + delayed_unlock = 1; - Restart: - -#if THREAD_SWAPPER - mutex_lock(&vm_page_queue_free_lock); - now = (vm_page_free_count < vm_page_free_min); - mutex_unlock(&vm_page_queue_free_lock); - - swapout_threads(now); -#endif /* THREAD_SWAPPER */ - - stack_collect(); - consider_task_collect(); - consider_thread_collect(); - consider_zone_gc(); - consider_machine_collect(); +Restart: + /* + * Recalculate vm_page_inactivate_target. + */ + vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count + + vm_page_inactive_count); + object = NULL; - loop_detect = vm_page_active_count + vm_page_inactive_count; -#if 0 - if (vm_page_free_count <= vm_page_free_reserved) { - need_more_inactive_pages = TRUE; - } else { - need_more_inactive_pages = FALSE; - } -#else - need_more_inactive_pages = FALSE; -#endif + for (;;) { + vm_page_t m; - for (burst_count = 0;;) { - register vm_page_t m; - register vm_object_t object; + if (delayed_unlock == 0) + vm_page_lock_queues(); - /* - * Recalculate vm_page_inactivate_target. - */ + active_burst_count = vm_page_active_count; - vm_page_lock_queues(); - vm_page_inactive_target = - VM_PAGE_INACTIVE_TARGET(vm_page_active_count + - vm_page_inactive_count); + if (active_burst_count > vm_pageout_burst_active_throttle) + active_burst_count = vm_pageout_burst_active_throttle; /* * Move pages from active to inactive. 
*/ - - while ((vm_page_inactive_count < vm_page_inactive_target || - need_more_inactive_pages) && - !queue_empty(&vm_page_queue_active)) { - register vm_object_t object; + while ((need_internal_inactive || + vm_page_inactive_count < vm_page_inactive_target) && + !queue_empty(&vm_page_queue_active) && + ((active_burst_count--) > 0)) { vm_pageout_active++; + m = (vm_page_t) queue_first(&vm_page_queue_active); + assert(m->active && !m->inactive); + assert(!m->laundry); + assert(m->object != kernel_object); + /* - * If we're getting really low on memory, - * try selecting a page that will go - * directly to the default_pager. - * If there are no such pages, we have to - * page out a page backed by an EMM, - * so that the default_pager can recover - * it eventually. + * Try to lock object; since we've already got the + * page queues lock, we can only 'try' for this one. + * if the 'try' fails, we need to do a mutex_pause + * to allow the owner of the object lock a chance to + * run... otherwise, we're likely to trip over this + * object in the same state as we work our way through + * the queue... clumps of pages associated with the same + * object are fairly typical on the inactive and active queues */ - if (need_more_inactive_pages && - (IP_VALID(memory_manager_default))) { - vm_pageout_scan_active_emm_throttle++; - do { - assert(m->active && !m->inactive); - object = m->object; - - if (vm_object_lock_try(object)) { -#if 0 - if (object->pager_trusted || - object->internal) { - /* found one ! */ - vm_pageout_scan_active_emm_throttle_success++; - goto object_locked_active; - } -#else - vm_pageout_scan_active_emm_throttle_success++; - goto object_locked_active; -#endif - vm_object_unlock(object); - } - m = (vm_page_t) queue_next(&m->pageq); - } while (!queue_end(&vm_page_queue_active, - (queue_entry_t) m)); - if (queue_end(&vm_page_queue_active, - (queue_entry_t) m)) { - vm_pageout_scan_active_emm_throttle_failure++; - m = (vm_page_t) - queue_first(&vm_page_queue_active); + if (m->object != object) { + if (object != NULL) { + vm_object_unlock(object); + object = NULL; } + if (!vm_object_lock_try(m->object)) { + /* + * move page to end of active queue and continue + */ + queue_remove(&vm_page_queue_active, m, + vm_page_t, pageq); + queue_enter(&vm_page_queue_active, m, + vm_page_t, pageq); + + goto done_with_activepage; + } + object = m->object; } - - assert(m->active && !m->inactive); - - object = m->object; - if (!vm_object_lock_try(object)) { - /* - * Move page to end and continue. - */ - - queue_remove(&vm_page_queue_active, m, - vm_page_t, pageq); - queue_enter(&vm_page_queue_active, m, - vm_page_t, pageq); - vm_page_unlock_queues(); - - mutex_pause(); - vm_page_lock_queues(); - continue; - } - - object_locked_active: /* - * If the page is busy, then we pull it - * off the active queue and leave it alone. + * if the page is BUSY, then we pull it + * off the active queue and leave it alone. 
+ * when BUSY is cleared, it will get stuck + * back on the appropriate queue */ - if (m->busy) { - vm_object_unlock(object); queue_remove(&vm_page_queue_active, m, vm_page_t, pageq); - m->active = FALSE; + m->pageq.next = NULL; + m->pageq.prev = NULL; + if (!m->fictitious) vm_page_active_count--; - continue; + m->active = FALSE; + + goto done_with_activepage; } + if (need_internal_inactive) { + /* + * If we're unable to make forward progress + * with the current set of pages on the + * inactive queue due to busy objects or + * throttled pageout queues, then + * move a page that is already clean + * or belongs to a pageout queue that + * isn't currently throttled + */ + active_throttled = FALSE; + if (object->internal) { + if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default))) + active_throttled = TRUE; + } else if (VM_PAGE_Q_THROTTLED(eq)) { + active_throttled = TRUE; + } + if (active_throttled == TRUE) { + if (!m->dirty) { + refmod_state = pmap_get_refmod(m->phys_page); + + if (refmod_state & VM_MEM_REFERENCED) + m->reference = TRUE; + if (refmod_state & VM_MEM_MODIFIED) + m->dirty = TRUE; + } + if (m->dirty || m->precious) { + /* + * page is dirty and targets a THROTTLED queue + * so all we can do is move it back to the + * end of the active queue to get it out + * of the way + */ + queue_remove(&vm_page_queue_active, m, + vm_page_t, pageq); + queue_enter(&vm_page_queue_active, m, + vm_page_t, pageq); + + vm_pageout_scan_active_throttled++; + + goto done_with_activepage; + } + } + vm_pageout_scan_active_throttle_success++; + need_internal_inactive--; + } /* * Deactivate the page while holding the object * locked, so we know the page is still not busy. @@ -1354,77 +1259,205 @@ vm_pageout_scan(void) * absent or fictitious, but vm_page_deactivate * can handle that. */ - vm_page_deactivate(m); - vm_object_unlock(object); - } +done_with_activepage: + if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) { - /* - * We are done if we have met our target *and* - * nobody is still waiting for a page. - */ - if (vm_page_free_count >= vm_page_free_target) { + if (object != NULL) { + vm_object_unlock(object); + object = NULL; + } + if (local_freeq) { + vm_page_free_list(local_freeq); + + local_freeq = 0; + local_freed = 0; + } + delayed_unlock = 0; + vm_page_unlock_queues(); + + mutex_pause(); + vm_page_lock_queues(); + /* + * continue the while loop processing + * the active queue... need to hold + * the page queues lock + */ + continue; + } + } + + + + /********************************************************************** + * above this point we're playing with the active queue + * below this point we're playing with the throttling mechanisms + * and the inactive queue + **********************************************************************/ + + + + /* + * We are done if we have met our target *and* + * nobody is still waiting for a page. + */ + if (vm_page_free_count + local_freed >= vm_page_free_target) { + if (object != NULL) { + vm_object_unlock(object); + object = NULL; + } + if (local_freeq) { + vm_page_free_list(local_freeq); + + local_freeq = 0; + local_freed = 0; + } mutex_lock(&vm_page_queue_free_lock); + if ((vm_page_free_count >= vm_page_free_target) && (vm_page_free_wanted == 0)) { + vm_page_unlock_queues(); - break; + + thread_wakeup((event_t) &vm_pageout_garbage_collect); + return; } mutex_unlock(&vm_page_queue_free_lock); } + + /* * Sometimes we have to pause: * 1) No inactive pages - nothing to do. - * 2) Flow control - wait for untrusted pagers to catch up. 
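
The done_with_activepage block above introduces a batching idiom that recurs throughout the new scan loop: pages bound for the free list are collected on a local chain, and the page queues lock is only dropped every DELAYED_UNLOCK_LIMIT iterations. A condensed sketch of that flush step follows; the function name is hypothetical and the locals mirror those declared in vm_pageout_scan().

/*
 * Sketch: flush the locally collected free pages and give other threads
 * a chance at the page queues lock (the pattern used at the
 * done_with_activepage / done_with_inactivepage labels).
 */
static void
example_flush_local_batch(vm_object_t *objp, vm_page_t *freeqp,
			  int *freedp, int *delayedp)
{
	if (*objp != NULL) {
		vm_object_unlock(*objp);
		*objp = NULL;
	}
	if (*freeqp) {
		vm_page_free_list(*freeqp);	/* free the whole chain in one call */
		*freeqp = 0;
		*freedp = 0;
	}
	*delayedp = 0;
	vm_page_unlock_queues();

	mutex_pause();				/* let lock waiters run */

	vm_page_lock_queues();
}
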
+ * 2) Flow control - default pageout queue is full + * 3) Loop control - no acceptable pages found on the inactive queue + * within the last vm_pageout_burst_inactive_throttle iterations */ + if ((queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf))) { + vm_pageout_scan_empty_throttle++; + msecs = vm_pageout_empty_wait; + goto vm_pageout_scan_delay; + + } else if (inactive_burst_count >= vm_pageout_burst_inactive_throttle) { + vm_pageout_scan_burst_throttle++; + msecs = vm_pageout_burst_wait; + goto vm_pageout_scan_delay; + + } else if (VM_PAGE_Q_THROTTLED(iq)) { + + switch (flow_control.state) { + + case FCS_IDLE: +reset_deadlock_timer: + ts.tv_sec = vm_pageout_deadlock_wait / 1000; + ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC; + sysclk_gettime(&flow_control.ts); + ADD_MACH_TIMESPEC(&flow_control.ts, &ts); + + flow_control.state = FCS_DELAYED; + msecs = vm_pageout_deadlock_wait; + + break; + + case FCS_DELAYED: + sysclk_gettime(&ts); + + if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) { + /* + * the pageout thread for the default pager is potentially + * deadlocked since the + * default pager queue has been throttled for more than the + * allowable time... we need to move some clean pages or dirty + * pages belonging to the external pagers if they aren't throttled + * vm_page_free_wanted represents the number of threads currently + * blocked waiting for pages... we'll move one page for each of + * these plus a fixed amount to break the logjam... once we're done + * moving this number of pages, we'll re-enter the FSC_DELAYED state + * with a new timeout target since we have no way of knowing + * whether we've broken the deadlock except through observation + * of the queue associated with the default pager... we need to + * stop moving pagings and allow the system to run to see what + * state it settles into. + */ + vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted; + vm_pageout_scan_deadlock_detected++; + flow_control.state = FCS_DEADLOCK_DETECTED; - if ((queue_empty(&vm_page_queue_inactive) && - (queue_empty(&vm_page_queue_zf))) || - ((--loop_detect) == 0) || - (burst_count >= vm_pageout_burst_max)) { - unsigned int pages, msecs; - int wait_result; + thread_wakeup((event_t) &vm_pageout_garbage_collect); + goto consider_inactive; + } + /* + * just resniff instead of trying + * to compute a new delay time... we're going to be + * awakened immediately upon a laundry completion, + * so we won't wait any longer than necessary + */ + msecs = vm_pageout_idle_wait; + break; - consider_machine_adjust(); - /* - * vm_pageout_burst_wait is msecs/page. - * If there is nothing for us to do, we wait - * at least vm_pageout_empty_wait msecs. 
- */ - pages = burst_count; - - if (loop_detect == 0) { - printf("Warning: No physical memory suitable for pageout or reclaim, pageout thread temporarily going to sleep\n"); - msecs = vm_free_page_pause; + case FCS_DEADLOCK_DETECTED: + if (vm_pageout_deadlock_target) + goto consider_inactive; + goto reset_deadlock_timer; + + } + vm_pageout_scan_throttle++; + iq->pgo_throttled = TRUE; +vm_pageout_scan_delay: + if (object != NULL) { + vm_object_unlock(object); + object = NULL; } - else { - msecs = burst_count * vm_pageout_burst_wait; + if (local_freeq) { + vm_page_free_list(local_freeq); + + local_freeq = 0; + local_freed = 0; } + assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC); + + counter(c_vm_pageout_scan_block++); - if (queue_empty(&vm_page_queue_inactive) && - queue_empty(&vm_page_queue_zf) && - (msecs < vm_pageout_empty_wait)) - msecs = vm_pageout_empty_wait; vm_page_unlock_queues(); + + thread_block(THREAD_CONTINUE_NULL); - assert_wait_timeout(msecs, THREAD_INTERRUPTIBLE); - counter(c_vm_pageout_scan_block++); + vm_page_lock_queues(); + delayed_unlock = 1; - /* - * Unfortunately, we don't have call_continuation - * so we can't rely on tail-recursion. - */ - wait_result = thread_block((void (*)(void)) 0); - if (wait_result != THREAD_TIMED_OUT) - thread_cancel_timer(); - vm_pageout_scan_continue(); + iq->pgo_throttled = FALSE; + + if (loop_count >= vm_page_inactive_count) { + if (VM_PAGE_Q_THROTTLED(eq) || VM_PAGE_Q_THROTTLED(iq)) { + /* + * Make sure we move enough "appropriate" + * pages to the inactive queue before trying + * again. + */ + need_internal_inactive = vm_pageout_inactive_relief; + } + loop_count = 0; + } + inactive_burst_count = 0; goto Restart; /*NOTREACHED*/ } + + flow_control.state = FCS_IDLE; +consider_inactive: + loop_count++; + inactive_burst_count++; vm_pageout_inactive++; + if (!queue_empty(&vm_page_queue_inactive)) { + m = (vm_page_t) queue_first(&vm_page_queue_inactive); + + if (m->clustered && (m->no_isync == TRUE)) { + goto use_this_page; + } + } if (vm_zf_count < vm_accellerate_zf_pageout_trigger) { vm_zf_iterator = 0; } else { @@ -1433,7 +1466,7 @@ vm_pageout_scan(void) vm_zf_iterator = 0; } } - if(queue_empty(&vm_page_queue_zf) || + if (queue_empty(&vm_page_queue_zf) || (((last_page_zf) || (vm_zf_iterator == 0)) && !queue_empty(&vm_page_queue_inactive))) { m = (vm_page_t) queue_first(&vm_page_queue_inactive); @@ -1442,108 +1475,89 @@ vm_pageout_scan(void) m = (vm_page_t) queue_first(&vm_page_queue_zf); last_page_zf = 1; } +use_this_page: + assert(!m->active && m->inactive); + assert(!m->laundry); + assert(m->object != kernel_object); - if ((vm_page_free_count <= vm_page_free_reserved) && - (IP_VALID(memory_manager_default))) { - /* - * We're really low on memory. Try to select a page that - * would go directly to the default_pager. - * If there are no such pages, we have to page out a - * page backed by an EMM, so that the default_pager - * can recover it eventually. - */ - vm_pageout_scan_inactive_emm_throttle++; - do { - assert(!m->active && m->inactive); - object = m->object; - - if (vm_object_lock_try(object)) { -#if 0 - if (object->pager_trusted || - object->internal) { - /* found one ! 
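
The three FCS_* states above implement a watchdog on the default pager. A compact restatement as a sketch, reusing the flow_control structure, sysclk_gettime() and the mach_timespec comparison macro from vm_pageout_scan(); the helper name is illustrative.

/*
 * Sketch of the deadlock watchdog:
 *
 *   FCS_IDLE              -> arm a deadline vm_pageout_deadlock_wait ms out
 *   FCS_DELAYED           -> still throttled past the deadline? assume the
 *                            default pager is wedged, set
 *                            vm_pageout_deadlock_target and start stealing
 *                            clean / external pages
 *   FCS_DEADLOCK_DETECTED -> once that target is consumed, re-arm the timer
 */
static boolean_t
example_deadline_expired(struct flow_control *fc)
{
	mach_timespec_t now;

	if (fc->state != FCS_DELAYED)
		return FALSE;

	sysclk_gettime(&now);
	/* fc->ts was set to "now + vm_pageout_deadlock_wait" on entry to FCS_DELAYED */
	return (CMP_MACH_TIMESPEC(&now, &fc->ts) >= 0) ? TRUE : FALSE;
}
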
*/ - vm_pageout_scan_inactive_emm_throttle_success++; - goto object_locked_inactive; - } -#else - vm_pageout_scan_inactive_emm_throttle_success++; - goto object_locked_inactive; -#endif /* 0 */ - vm_object_unlock(object); + /* + * Try to lock object; since we've alread got the + * page queues lock, we can only 'try' for this one. + * if the 'try' fails, we need to do a mutex_pause + * to allow the owner of the object lock a chance to + * run... otherwise, we're likely to trip over this + * object in the same state as we work our way through + * the queue... clumps of pages associated with the same + * object are fairly typical on the inactive and active queues + */ + if (m->object != object) { + if (object != NULL) { + vm_object_unlock(object); + object = NULL; + } + if (!vm_object_lock_try(m->object)) { + /* + * Move page to end and continue. + * Don't re-issue ticket + */ + if (m->zero_fill) { + queue_remove(&vm_page_queue_zf, m, + vm_page_t, pageq); + queue_enter(&vm_page_queue_zf, m, + vm_page_t, pageq); + } else { + queue_remove(&vm_page_queue_inactive, m, + vm_page_t, pageq); + queue_enter(&vm_page_queue_inactive, m, + vm_page_t, pageq); } - m = (vm_page_t) queue_next(&m->pageq); - } while ((!queue_end(&vm_page_queue_zf, - (queue_entry_t) m)) - && (!queue_end(&vm_page_queue_inactive, - (queue_entry_t) m))); - - if ((queue_end(&vm_page_queue_zf, - (queue_entry_t) m)) - || (queue_end(&vm_page_queue_inactive, - (queue_entry_t) m))) { - vm_pageout_scan_inactive_emm_throttle_failure++; + vm_pageout_inactive_nolock++; + /* - * We should check the "active" queue - * for good candidates to page out. + * force us to dump any collected free pages + * and to pause before moving on */ - need_more_inactive_pages = TRUE; + delayed_unlock = DELAYED_UNLOCK_LIMIT + 1; - if(last_page_zf == 0) { - last_page_zf = 1; - vm_zf_iterator = vm_zf_iterator_count - 1; - } else { - last_page_zf = 0; - vm_zf_iterator = vm_zf_iterator_count - 2; - } - vm_page_unlock_queues(); - goto Restart; + goto done_with_inactivepage; } + object = m->object; } - - assert(!m->active && m->inactive); - object = m->object; - /* - * Try to lock object; since we've got the - * page queues lock, we can only try for this one. + * If the page belongs to a purgable object with no pending copies + * against it, then we reap all of the pages in the object + * and note that the object has been "emptied". It'll be up to the + * application the discover this and recreate its contents if desired. */ + if ((object->purgable == VM_OBJECT_PURGABLE_VOLATILE || + object->purgable == VM_OBJECT_PURGABLE_EMPTY) && + object->copy == VM_OBJECT_NULL) { - if (!vm_object_lock_try(object)) { + (void) vm_object_purge(object); + vm_pageout_purged_objects++; /* - * Move page to end and continue. 
- * Don't re-issue ticket + * we've just taken all of the pages from this object, + * so drop the lock now since we're not going to find + * any more pages belonging to it anytime soon */ - if(m->zero_fill) { - queue_remove(&vm_page_queue_zf, m, - vm_page_t, pageq); - queue_enter(&vm_page_queue_zf, m, - vm_page_t, pageq); - } else { - queue_remove(&vm_page_queue_inactive, m, - vm_page_t, pageq); - queue_enter(&vm_page_queue_inactive, m, - vm_page_t, pageq); - } - vm_page_unlock_queues(); + vm_object_unlock(object); + object = NULL; - mutex_pause(); - vm_pageout_inactive_nolock++; - continue; + inactive_burst_count = 0; + + goto done_with_inactivepage; } - object_locked_inactive: /* - * Paging out pages of objects which pager is being - * created by another thread must be avoided, because - * this thread may claim for memory, thus leading to a - * possible dead lock between it and the pageout thread - * which will wait for pager creation, if such pages are - * finally chosen. The remaining assumption is that there - * will finally be enough available pages in the inactive - * pool to page out in order to satisfy all memory claimed - * by the thread which concurrently creates the pager. + * Paging out pages of external objects which + * are currently being created must be avoided. + * The pager may claim for memory, thus leading to a + * possible dead lock between it and the pageout thread, + * if such pages are finally chosen. The remaining assumption + * is that there will finally be enough available pages in the + * inactive pool to page out in order to satisfy all memory + * claimed by the thread which concurrently creates the pager. */ - if (!object->pager_initialized && object->pager_created) { /* * Move page to end and continue, hoping that @@ -1555,7 +1569,7 @@ vm_pageout_scan(void) * one of its logically adjacent fellows is * targeted. */ - if(m->zero_fill) { + if (m->zero_fill) { queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq); queue_enter(&vm_page_queue_zf, m, @@ -1570,21 +1584,20 @@ vm_pageout_scan(void) last_page_zf = 0; vm_zf_iterator = 1; } - vm_page_unlock_queues(); - vm_object_unlock(object); vm_pageout_inactive_avoid++; - continue; - } + goto done_with_inactivepage; + } /* * Remove the page from the inactive list. */ - - if(m->zero_fill) { + if (m->zero_fill) { queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq); } else { queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq); } + m->pageq.next = NULL; + m->pageq.prev = NULL; m->inactive = FALSE; if (!m->fictitious) vm_page_inactive_count--; @@ -1594,11 +1607,9 @@ vm_pageout_scan(void) * Somebody is already playing with this page. * Leave it off the pageout queues. 
*/ - - vm_page_unlock_queues(); - vm_object_unlock(object); vm_pageout_inactive_busy++; - continue; + + goto done_with_inactivepage; } /* @@ -1607,11 +1618,25 @@ vm_pageout_scan(void) if (m->absent || m->error) { vm_pageout_inactive_absent++; - reclaim_page: - vm_page_free(m); - vm_page_unlock_queues(); - vm_object_unlock(object); - continue; +reclaim_page: + if (vm_pageout_deadlock_target) { + vm_pageout_scan_inactive_throttle_success++; + vm_pageout_deadlock_target--; + } + if (m->tabled) + vm_page_remove(m); /* clears tabled, object, offset */ + if (m->absent) + vm_object_absent_release(object); + + assert(m->pageq.next == NULL && + m->pageq.prev == NULL); + m->pageq.next = (queue_entry_t)local_freeq; + local_freeq = m; + local_freed++; + + inactive_burst_count = 0; + + goto done_with_inactivepage; } assert(!m->private); @@ -1625,222 +1650,163 @@ vm_pageout_scan(void) */ if (m->cleaning) { -#if MACH_CLUSTER_STATS - vm_pageout_cluster_conversions++; -#endif m->busy = TRUE; m->pageout = TRUE; m->dump_cleaning = TRUE; vm_page_wire(m); - vm_object_unlock(object); - vm_page_unlock_queues(); - continue; + + CLUSTER_STAT(vm_pageout_cluster_conversions++); + + inactive_burst_count = 0; + + goto done_with_inactivepage; } /* * If it's being used, reactivate. * (Fictitious pages are either busy or absent.) */ - - if (m->reference || pmap_is_referenced(m->phys_page)) { - vm_pageout_inactive_used++; - reactivate_page: -#if ADVISORY_PAGEOUT - if (m->discard_request) { - m->discard_request = FALSE; - } -#endif /* ADVISORY_PAGEOUT */ - last_page_zf = 0; - vm_object_unlock(object); + if ( (!m->reference) ) { + refmod_state = pmap_get_refmod(m->phys_page); + + if (refmod_state & VM_MEM_REFERENCED) + m->reference = TRUE; + if (refmod_state & VM_MEM_MODIFIED) + m->dirty = TRUE; + } + if (m->reference) { +was_referenced: vm_page_activate(m); VM_STAT(reactivations++); - vm_page_unlock_queues(); - continue; - } -#if ADVISORY_PAGEOUT - if (object->advisory_pageout) { - boolean_t do_throttle; - memory_object_t pager; - vm_object_offset_t discard_offset; - - if (m->discard_request) { - vm_stat_discard_failure++; - goto mandatory_pageout; - } + vm_pageout_inactive_used++; + last_page_zf = 0; + inactive_burst_count = 0; - assert(object->pager_initialized); - m->discard_request = TRUE; - pager = object->pager; + goto done_with_inactivepage; + } - /* system-wide throttle */ - do_throttle = (vm_page_free_count <= - vm_page_free_reserved); + XPR(XPR_VM_PAGEOUT, + "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n", + (integer_t)object, (integer_t)m->offset, (integer_t)m, 0,0); -#if 0 - /* - * JMM - Do we need a replacement throttle - * mechanism for pagers? - */ - if (!do_throttle) { - /* throttle on this pager */ - /* XXX lock ordering ? */ - ip_lock(port); - do_throttle= imq_full(&port->ip_messages); - ip_unlock(port); + /* + * we've got a candidate page to steal... + * + * m->dirty is up to date courtesy of the + * preceding check for m->reference... 
if + * we get here, then m->reference had to be + * FALSE which means we did a pmap_get_refmod + * and updated both m->reference and m->dirty + * + * if it's dirty or precious we need to + * see if the target queue is throtttled + * it if is, we need to skip over it by moving it back + * to the end of the inactive queue + */ + inactive_throttled = FALSE; + + if (m->dirty || m->precious) { + if (object->internal) { + if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default))) + inactive_throttled = TRUE; + } else if (VM_PAGE_Q_THROTTLED(eq)) { + inactive_throttled = TRUE; } -#endif - - if (do_throttle) { - vm_stat_discard_throttle++; -#if 0 - /* ignore this page and skip to next */ - vm_page_unlock_queues(); - vm_object_unlock(object); - continue; -#else - /* force mandatory pageout */ - goto mandatory_pageout; -#endif + } + if (inactive_throttled == TRUE) { + if (m->zero_fill) { + queue_enter(&vm_page_queue_zf, m, + vm_page_t, pageq); + } else { + queue_enter(&vm_page_queue_inactive, m, + vm_page_t, pageq); } + if (!m->fictitious) + vm_page_inactive_count++; + m->inactive = TRUE; - /* proceed with discard_request */ - vm_page_activate(m); - vm_stat_discard++; - VM_STAT(reactivations++); - discard_offset = m->offset + object->paging_offset; - vm_stat_discard_sent++; - vm_page_unlock_queues(); - vm_object_unlock(object); + vm_pageout_scan_inactive_throttled++; -/* - memory_object_discard_request(object->pager, - discard_offset, - PAGE_SIZE); -*/ - continue; + goto done_with_inactivepage; } - mandatory_pageout: -#endif /* ADVISORY_PAGEOUT */ - - XPR(XPR_VM_PAGEOUT, - "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n", - (integer_t)object, (integer_t)m->offset, (integer_t)m, 0,0); - /* - * Eliminate all mappings. + * we've got a page that we can steal... + * eliminate all mappings and make sure + * we have the up-to-date modified state + * first take the page BUSY, so that no new + * mappings can be made */ - m->busy = TRUE; - pmap_page_protect(m->phys_page, VM_PROT_NONE); + + /* + * if we need to do a pmap_disconnect then we + * need to re-evaluate m->dirty since the pmap_disconnect + * provides the true state atomically... the + * page was still mapped up to the pmap_disconnect + * and may have been dirtied at the last microsecond + * + * we also check for the page being referenced 'late' + * if it was, we first need to do a WAKEUP_DONE on it + * since we already set m->busy = TRUE, before + * going off to reactivate it + * + * if we don't need the pmap_disconnect, then + * m->dirty is up to date courtesy of the + * earlier check for m->reference... if + * we get here, then m->reference had to be + * FALSE which means we did a pmap_get_refmod + * and updated both m->reference and m->dirty... + */ + if (m->no_isync == FALSE) { + refmod_state = pmap_disconnect(m->phys_page); - if (!m->dirty) - m->dirty = pmap_is_modified(m->phys_page); + if (refmod_state & VM_MEM_MODIFIED) + m->dirty = TRUE; + if (refmod_state & VM_MEM_REFERENCED) { + m->reference = TRUE; + + PAGE_WAKEUP_DONE(m); + goto was_referenced; + } + } /* * If it's clean and not precious, we can free the page. */ - if (!m->dirty && !m->precious) { vm_pageout_inactive_clean++; goto reclaim_page; } - vm_page_unlock_queues(); - - /* - * If there is no memory object for the page, create - * one and hand it to the default pager. 
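
Throughout this patch the old pmap_page_protect() + pmap_is_modified() pair is replaced by a single pmap_disconnect(), which tears down all mappings and returns the accumulated referenced/modified bits atomically. A minimal sketch of the idiom, using the same names that appear in the scan loop above; the wrapper name is hypothetical.

/*
 * Sketch: remove every mapping of a page and fold its final ref/mod
 * state back into the vm_page in one step.
 */
static void
example_disconnect_page(vm_page_t m)
{
	int refmod_state;

	refmod_state = pmap_disconnect(m->phys_page);

	if (refmod_state & VM_MEM_MODIFIED)
		m->dirty = TRUE;	/* may have been dirtied right up to the disconnect */
	if (refmod_state & VM_MEM_REFERENCED)
		m->reference = TRUE;
}
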
- */ + vm_pageout_cluster(m); - if (!object->pager_initialized) - vm_object_collapse(object); - if (!object->pager_initialized) - vm_object_pager_create(object); - if (!object->pager_initialized) { - /* - * Still no pager for the object. - * Reactivate the page. - * - * Should only happen if there is no - * default pager. - */ - vm_page_lock_queues(); - vm_page_activate(m); - vm_page_unlock_queues(); + vm_pageout_inactive_dirty++; - /* - * And we are done with it. - */ - PAGE_WAKEUP_DONE(m); - vm_object_unlock(object); + inactive_burst_count = 0; - /* - * break here to get back to the preemption - * point in the outer loop so that we don't - * spin forever if there is no default pager. - */ - vm_pageout_dirty_no_pager++; - /* - * Well there's no pager, but we can still reclaim - * free pages out of the inactive list. Go back - * to top of loop and look for suitable pages. - */ - continue; - } +done_with_inactivepage: + if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) { - if ((object->pager_initialized) && - (object->pager == MEMORY_OBJECT_NULL)) { - /* - * This pager has been destroyed by either - * memory_object_destroy or vm_object_destroy, and - * so there is nowhere for the page to go. - * Just free the page. - */ - VM_PAGE_FREE(m); - vm_object_unlock(object); - continue; + if (object != NULL) { + vm_object_unlock(object); + object = NULL; + } + if (local_freeq) { + vm_page_free_list(local_freeq); + + local_freeq = 0; + local_freed = 0; + } + delayed_unlock = 0; + vm_page_unlock_queues(); + mutex_pause(); } - - vm_pageout_inactive_dirty++; -/* - if (!object->internal) - burst_count++; -*/ - vm_object_paging_begin(object); - vm_object_unlock(object); - vm_pageout_cluster(m); /* flush it */ + /* + * back to top of pageout scan loop + */ } - consider_machine_adjust(); } -counter(unsigned int c_vm_pageout_scan_continue = 0;) - -void -vm_pageout_scan_continue(void) -{ - /* - * We just paused to let the pagers catch up. - * If vm_page_laundry_count is still high, - * then we aren't waiting long enough. - * If we have paused some vm_pageout_pause_max times without - * adjusting vm_pageout_burst_wait, it might be too big, - * so we decrease it. - */ - - vm_page_lock_queues(); - counter(++c_vm_pageout_scan_continue); - if (vm_page_laundry_count > vm_pageout_burst_min) { - vm_pageout_burst_wait++; - vm_pageout_pause_count = 0; - } else if (++vm_pageout_pause_count > vm_pageout_pause_max) { - vm_pageout_burst_wait = (vm_pageout_burst_wait * 3) / 4; - if (vm_pageout_burst_wait < 1) - vm_pageout_burst_wait = 1; - vm_pageout_pause_count = 0; - } - vm_page_unlock_queues(); -} -void vm_page_free_reserve(int pages); int vm_page_free_count_init; void @@ -1867,167 +1833,364 @@ vm_page_free_reserve( * vm_pageout is the high level pageout daemon. */ - void -vm_pageout(void) +vm_pageout_continue(void) { - thread_t self = current_thread(); - spl_t s; + vm_pageout_scan_event_counter++; + vm_pageout_scan(); + /* we hold vm_page_queue_free_lock now */ + assert(vm_page_free_wanted == 0); + assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT); + mutex_unlock(&vm_page_queue_free_lock); - /* - * Set thread privileges. - */ - self->vm_privilege = TRUE; - stack_privilege(self); + counter(c_vm_pageout_block++); + thread_block((thread_continue_t)vm_pageout_continue); + /*NOTREACHED*/ +} - s = splsched(); - thread_lock(self); - self->priority = BASEPRI_PREEMPT - 1; - set_sched_pri(self, self->priority); - thread_unlock(self); - splx(s); - /* - * Initialize some paging parameters. 
- */ +/* + * must be called with the + * queues and object locks held + */ +static void +vm_pageout_queue_steal(vm_page_t m) +{ + struct vm_pageout_queue *q; - if (vm_page_laundry_max == 0) - vm_page_laundry_max = VM_PAGE_LAUNDRY_MAX; + if (m->object->internal == TRUE) + q = &vm_pageout_queue_internal; + else + q = &vm_pageout_queue_external; - if (vm_pageout_burst_max == 0) - vm_pageout_burst_max = VM_PAGEOUT_BURST_MAX; + m->laundry = FALSE; + m->pageout_queue = FALSE; + queue_remove(&q->pgo_pending, m, vm_page_t, pageq); - if (vm_pageout_burst_wait == 0) - vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT; + m->pageq.next = NULL; + m->pageq.prev = NULL; - if (vm_pageout_empty_wait == 0) - vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT; + vm_object_paging_end(m->object); - vm_page_free_count_init = vm_page_free_count; - vm_zf_iterator = 0; - /* - * even if we've already called vm_page_free_reserve - * call it again here to insure that the targets are - * accurately calculated (it uses vm_page_free_count_init) - * calling it with an arg of 0 will not change the reserve - * but will re-calculate free_min and free_target - */ - if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED) - vm_page_free_reserve(VM_PAGE_FREE_RESERVED - vm_page_free_reserved); - else - vm_page_free_reserve(0); - - /* - * vm_pageout_scan will set vm_page_inactive_target. - * - * The pageout daemon is never done, so loop forever. - * We should call vm_pageout_scan at least once each - * time we are woken, even if vm_page_free_wanted is - * zero, to check vm_page_free_target and - * vm_page_inactive_target. - */ - for (;;) { - vm_pageout_scan_event_counter++; - vm_pageout_scan(); - /* we hold vm_page_queue_free_lock now */ - assert(vm_page_free_wanted == 0); - assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT); - mutex_unlock(&vm_page_queue_free_lock); - counter(c_vm_pageout_block++); - thread_block((void (*)(void)) 0); - } - /*NOTREACHED*/ + q->pgo_laundry--; } -kern_return_t -vm_pageout_emergency_availability_request() + +#ifdef FAKE_DEADLOCK + +#define FAKE_COUNT 5000 + +int internal_count = 0; +int fake_deadlock = 0; + +#endif + +static void +vm_pageout_iothread_continue(struct vm_pageout_queue *q) { - vm_page_t m; + vm_page_t m = NULL; vm_object_t object; + boolean_t need_wakeup; vm_page_lock_queues(); - m = (vm_page_t) queue_first(&vm_page_queue_inactive); - while (!queue_end(&vm_page_queue_inactive, (queue_entry_t) m)) { - if(m->fictitious) { - m = (vm_page_t) queue_next(&m->pageq); - continue; - } - if (!m->dirty) - m->dirty = pmap_is_modified(m->phys_page); - if(m->dirty || m->busy || m->wire_count || m->absent - || m->precious || m->cleaning - || m->dump_cleaning || m->error - || m->pageout || m->laundry - || m->list_req_pending - || m->overwriting) { - m = (vm_page_t) queue_next(&m->pageq); - continue; - } - object = m->object; + while ( !queue_empty(&q->pgo_pending) ) { - if (vm_object_lock_try(object)) { - if((!object->alive) || - (object->pageout)) { - vm_object_unlock(object); - m = (vm_page_t) queue_next(&m->pageq); - continue; - } - m->busy = TRUE; - pmap_page_protect(m->phys_page, VM_PROT_NONE); - vm_page_free(m); - vm_object_unlock(object); - vm_page_unlock_queues(); - return KERN_SUCCESS; - } - m = (vm_page_t) queue_next(&m->pageq); - } + q->pgo_busy = TRUE; + queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq); + m->pageout_queue = FALSE; + vm_page_unlock_queues(); - m = (vm_page_t) queue_first(&vm_page_queue_active); + m->pageq.next = NULL; + m->pageq.prev = NULL; +#ifdef FAKE_DEADLOCK + if (q 
== &vm_pageout_queue_internal) { + vm_offset_t addr; + int pg_count; - while (!queue_end(&vm_page_queue_active, (queue_entry_t) m)) { - if(m->fictitious) { - m = (vm_page_t) queue_next(&m->pageq); - continue; - } - if (!m->dirty) - m->dirty = pmap_is_modified(m->phys_page); - if(m->dirty || m->busy || m->wire_count || m->absent - || m->precious || m->cleaning - || m->dump_cleaning || m->error - || m->pageout || m->laundry - || m->list_req_pending - || m->overwriting) { - m = (vm_page_t) queue_next(&m->pageq); - continue; - } - object = m->object; + internal_count++; - if (vm_object_lock_try(object)) { - if((!object->alive) || - (object->pageout)) { - vm_object_unlock(object); - m = (vm_page_t) queue_next(&m->pageq); - continue; - } - m->busy = TRUE; - pmap_page_protect(m->phys_page, VM_PROT_NONE); - vm_page_free(m); - vm_object_unlock(object); - vm_page_unlock_queues(); - return KERN_SUCCESS; - } - m = (vm_page_t) queue_next(&m->pageq); + if ((internal_count == FAKE_COUNT)) { + + pg_count = vm_page_free_count + vm_page_free_reserved; + + if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) { + kmem_free(kernel_map, addr, PAGE_SIZE * pg_count); + } + internal_count = 0; + fake_deadlock++; + } + } +#endif + object = m->object; + + if (!object->pager_initialized) { + vm_object_lock(object); + + /* + * If there is no memory object for the page, create + * one and hand it to the default pager. + */ + + if (!object->pager_initialized) + vm_object_collapse(object, (vm_object_offset_t)0); + if (!object->pager_initialized) + vm_object_pager_create(object); + if (!object->pager_initialized) { + /* + * Still no pager for the object. + * Reactivate the page. + * + * Should only happen if there is no + * default pager. + */ + m->list_req_pending = FALSE; + m->cleaning = FALSE; + m->pageout = FALSE; + vm_page_unwire(m); + + vm_pageout_throttle_up(m); + + vm_page_lock_queues(); + vm_pageout_dirty_no_pager++; + vm_page_activate(m); + vm_page_unlock_queues(); + + /* + * And we are done with it. + */ + PAGE_WAKEUP_DONE(m); + + vm_object_paging_end(object); + vm_object_unlock(object); + + vm_page_lock_queues(); + continue; + } else if (object->pager == MEMORY_OBJECT_NULL) { + /* + * This pager has been destroyed by either + * memory_object_destroy or vm_object_destroy, and + * so there is nowhere for the page to go. + * Just free the page... VM_PAGE_FREE takes + * care of cleaning up all the state... + * including doing the vm_pageout_throttle_up + */ + VM_PAGE_FREE(m); + + vm_object_paging_end(object); + vm_object_unlock(object); + + vm_page_lock_queues(); + continue; + } + vm_object_unlock(object); + } + /* + * we expect the paging_in_progress reference to have + * already been taken on the object before it was added + * to the appropriate pageout I/O queue... this will + * keep the object from being terminated and/or the + * paging_offset from changing until the I/O has + * completed... therefore no need to lock the object to + * pull the paging_offset from it. + * + * Send the data to the pager. 
+ * any pageout clustering happens there + */ + memory_object_data_return(object->pager, + m->offset + object->paging_offset, + PAGE_SIZE, + NULL, + NULL, + FALSE, + FALSE, + 0); + + vm_object_lock(object); + vm_object_paging_end(object); + vm_object_unlock(object); + + vm_page_lock_queues(); } + assert_wait((event_t) q, THREAD_UNINT); + + + if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) { + q->pgo_throttled = FALSE; + need_wakeup = TRUE; + } else + need_wakeup = FALSE; + + q->pgo_busy = FALSE; + q->pgo_idle = TRUE; vm_page_unlock_queues(); - return KERN_FAILURE; + + if (need_wakeup == TRUE) + thread_wakeup((event_t) &q->pgo_laundry); + + thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending); + /*NOTREACHED*/ +} + + +static void +vm_pageout_iothread_external(void) +{ + + vm_pageout_iothread_continue(&vm_pageout_queue_external); + /*NOTREACHED*/ +} + + +static void +vm_pageout_iothread_internal(void) +{ + thread_t self = current_thread(); + + self->options |= TH_OPT_VMPRIV; + + vm_pageout_iothread_continue(&vm_pageout_queue_internal); + /*NOTREACHED*/ +} + +static void +vm_pageout_garbage_collect(int collect) +{ + if (collect) { + stack_collect(); + + /* + * consider_zone_gc should be last, because the other operations + * might return memory to zones. + */ + consider_machine_collect(); + consider_zone_gc(); + + consider_machine_adjust(); + } + + assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT); + + thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1); + /*NOTREACHED*/ +} + + + +void +vm_pageout(void) +{ + thread_t self = current_thread(); + thread_t thread; + kern_return_t result; + spl_t s; + + /* + * Set thread privileges. + */ + s = splsched(); + thread_lock(self); + self->priority = BASEPRI_PREEMPT - 1; + set_sched_pri(self, self->priority); + thread_unlock(self); + splx(s); + + /* + * Initialize some paging parameters. 
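The pgo_laundry / pgo_maxlaundry / pgo_throttled fields used above form a simple back-pressure protocol: the scan side stops queueing work once VM_PAGE_Q_THROTTLED(q) is true and marks the queue throttled, and the I/O thread clears the flag and issues a wakeup on &q->pgo_laundry once the count drains. A minimal user-space model of that handshake follows, with pthread primitives standing in for assert_wait/thread_wakeup; throttle_queue, queue_page and complete_page are illustrative names, not kernel functions.

#include <pthread.h>
#include <stdbool.h>

struct throttle_queue {
    pthread_mutex_t lock;
    pthread_cond_t  laundry_cv;   /* stand-in for the wakeup on &q->pgo_laundry */
    unsigned laundry;             /* pages queued or in flight */
    unsigned maxlaundry;          /* VM_PAGE_LAUNDRY_MAX analog */
    bool throttled;               /* producer is waiting for us to drain */
};

/* Producer side (vm_pageout_scan analog): block while the queue is full. */
void queue_page(struct throttle_queue *q) {
    pthread_mutex_lock(&q->lock);
    while (q->laundry >= q->maxlaundry) {     /* VM_PAGE_Q_THROTTLED(q) */
        q->throttled = true;
        pthread_cond_wait(&q->laundry_cv, &q->lock);
    }
    q->laundry++;                             /* page handed to the I/O thread */
    pthread_mutex_unlock(&q->lock);
}

/* Consumer side (pageout I/O thread analog): once the pager has taken the
 * page, drop the laundry count and wake the scanner if it throttled itself. */
void complete_page(struct throttle_queue *q) {
    pthread_mutex_lock(&q->lock);
    q->laundry--;
    if (q->throttled && q->laundry < q->maxlaundry) {
        q->throttled = false;
        pthread_cond_signal(&q->laundry_cv);
    }
    pthread_mutex_unlock(&q->lock);
}

Note that, as in the diff, the wakeup is only issued when the scan thread has declared itself throttled, which avoids signalling on every completion.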
+ */ + + if (vm_pageout_idle_wait == 0) + vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT; + + if (vm_pageout_burst_wait == 0) + vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT; + + if (vm_pageout_empty_wait == 0) + vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT; + + if (vm_pageout_deadlock_wait == 0) + vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT; + + if (vm_pageout_deadlock_relief == 0) + vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF; + + if (vm_pageout_inactive_relief == 0) + vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF; + + if (vm_pageout_burst_active_throttle == 0) + vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE; + + if (vm_pageout_burst_inactive_throttle == 0) + vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE; + + /* + * Set kernel task to low backing store privileged + * status + */ + task_lock(kernel_task); + kernel_task->priv_flags |= VM_BACKING_STORE_PRIV; + task_unlock(kernel_task); + + vm_page_free_count_init = vm_page_free_count; + vm_zf_iterator = 0; + /* + * even if we've already called vm_page_free_reserve + * call it again here to insure that the targets are + * accurately calculated (it uses vm_page_free_count_init) + * calling it with an arg of 0 will not change the reserve + * but will re-calculate free_min and free_target + */ + if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) { + vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved); + } else + vm_page_free_reserve(0); + + + queue_init(&vm_pageout_queue_external.pgo_pending); + vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX; + vm_pageout_queue_external.pgo_laundry = 0; + vm_pageout_queue_external.pgo_idle = FALSE; + vm_pageout_queue_external.pgo_busy = FALSE; + vm_pageout_queue_external.pgo_throttled = FALSE; + + queue_init(&vm_pageout_queue_internal.pgo_pending); + vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX; + vm_pageout_queue_internal.pgo_laundry = 0; + vm_pageout_queue_internal.pgo_idle = FALSE; + vm_pageout_queue_internal.pgo_busy = FALSE; + vm_pageout_queue_internal.pgo_throttled = FALSE; + + + result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &thread); + if (result != KERN_SUCCESS) + panic("vm_pageout_iothread_internal: create failed"); + + thread_deallocate(thread); + + + result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL, BASEPRI_PREEMPT - 1, &thread); + if (result != KERN_SUCCESS) + panic("vm_pageout_iothread_external: create failed"); + + thread_deallocate(thread); + + + result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL, BASEPRI_PREEMPT - 2, &thread); + if (result != KERN_SUCCESS) + panic("vm_pageout_garbage_collect: create failed"); + + thread_deallocate(thread); + + + vm_pageout_continue(); + /*NOTREACHED*/ } static upl_t upl_create( - int flags, - vm_size_t size) + int flags, + upl_size_t size) { upl_t upl; int page_field_size; /* bit field in word size buf */ @@ -2051,10 +2214,10 @@ upl_create( upl->map_object = NULL; upl->ref_count = 1; upl_lock_init(upl); -#ifdef UBC_DEBUG +#ifdef UPL_DEBUG upl->ubc_alias1 = 0; upl->ubc_alias2 = 0; -#endif /* UBC_DEBUG */ +#endif /* UPL_DEBUG */ return(upl); } @@ -2064,7 +2227,7 @@ upl_destroy( { int page_field_size; /* bit field in word size buf */ -#ifdef UBC_DEBUG +#ifdef UPL_DEBUG { upl_t upl_ele; vm_object_t object; @@ -2083,7 +2246,7 @@ upl_destroy( } 
vm_object_unlock(object); } -#endif /* UBC_DEBUG */ +#endif /* UPL_DEBUG */ /* drop a reference on the map_object whether or */ /* not a pageout object is inserted */ if(upl->map_object->pageout) @@ -2095,15 +2258,16 @@ upl_destroy( page_field_size = (page_field_size + 3) & 0xFFFFFFFC; } if(upl->flags & UPL_INTERNAL) { - kfree((vm_offset_t)upl, - sizeof(struct upl) + - (sizeof(struct upl_page_info) * (upl->size/PAGE_SIZE)) - + page_field_size); + kfree(upl, + sizeof(struct upl) + + (sizeof(struct upl_page_info) * (upl->size/PAGE_SIZE)) + + page_field_size); } else { - kfree((vm_offset_t)upl, sizeof(struct upl) + page_field_size); + kfree(upl, sizeof(struct upl) + page_field_size); } } +void uc_upl_dealloc(upl_t upl); __private_extern__ void uc_upl_dealloc( upl_t upl) @@ -2125,6 +2289,16 @@ upl_deallocate( } } +/* + * Statistics about UPL enforcement of copy-on-write obligations. + */ +unsigned long upl_cow = 0; +unsigned long upl_cow_again = 0; +unsigned long upl_cow_contiguous = 0; +unsigned long upl_cow_pages = 0; +unsigned long upl_cow_again_pages = 0; +unsigned long upl_cow_contiguous_pages = 0; + /* * Routine: vm_object_upl_request * Purpose: @@ -2168,28 +2342,42 @@ upl_deallocate( * the vm_objects (cache objects), they support. * */ + __private_extern__ kern_return_t vm_object_upl_request( vm_object_t object, - vm_object_offset_t offset, - vm_size_t size, + vm_object_offset_t offset, + upl_size_t size, upl_t *upl_ptr, upl_page_info_array_t user_page_list, unsigned int *page_list_count, - int cntrl_flags) + int cntrl_flags) { - vm_page_t dst_page; + vm_page_t dst_page = VM_PAGE_NULL; vm_object_offset_t dst_offset = offset; - vm_size_t xfer_size = size; + upl_size_t xfer_size = size; boolean_t do_m_lock = FALSE; boolean_t dirty; + boolean_t hw_dirty; upl_t upl = NULL; - int entry; + unsigned int entry; +#if MACH_CLUSTER_STATS boolean_t encountered_lrp = FALSE; - +#endif vm_page_t alias_page = NULL; int page_ticket; - wpl_array_t lite_list; + int refmod_state; + wpl_array_t lite_list = NULL; + vm_object_t last_copy_object; + + + if (cntrl_flags & ~UPL_VALID_FLAGS) { + /* + * For forward compatibility's sake, + * reject any unknown flag. 
+ */ + return KERN_INVALID_VALUE; + } page_ticket = (cntrl_flags & UPL_PAGE_TICKET_MASK) >> UPL_PAGE_TICKET_SHIFT; @@ -2201,10 +2389,6 @@ vm_object_upl_request( if(cntrl_flags & UPL_SET_INTERNAL) if(page_list_count != NULL) *page_list_count = MAX_UPL_TRANSFER; - if(((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) && - ((page_list_count != NULL) && (*page_list_count != 0) - && *page_list_count < (size/page_size))) - return KERN_INVALID_ARGUMENT; if((!object->internal) && (object->paging_offset != 0)) panic("vm_object_upl_request: vnode object with non-zero paging offset\n"); @@ -2213,17 +2397,21 @@ vm_object_upl_request( return KERN_SUCCESS; } + vm_object_lock(object); + vm_object_paging_begin(object); + vm_object_unlock(object); + if(upl_ptr) { if(cntrl_flags & UPL_SET_INTERNAL) { if(cntrl_flags & UPL_SET_LITE) { - vm_offset_t page_field_size; + uintptr_t page_field_size; upl = upl_create( UPL_CREATE_INTERNAL | UPL_CREATE_LITE, size); user_page_list = (upl_page_info_t *) - (((vm_offset_t)upl) + sizeof(struct upl)); + (((uintptr_t)upl) + sizeof(struct upl)); lite_list = (wpl_array_t) - (((vm_offset_t)user_page_list) + + (((uintptr_t)user_page_list) + ((size/PAGE_SIZE) * sizeof(upl_page_info_t))); page_field_size = ((size/PAGE_SIZE) + 7) >> 3; @@ -2235,16 +2423,15 @@ vm_object_upl_request( } else { upl = upl_create(UPL_CREATE_INTERNAL, size); user_page_list = (upl_page_info_t *) - (((vm_offset_t)upl) - + sizeof(struct upl)); + (((uintptr_t)upl) + sizeof(struct upl)); upl->flags = UPL_INTERNAL; } } else { if(cntrl_flags & UPL_SET_LITE) { - vm_offset_t page_field_size; + uintptr_t page_field_size; upl = upl_create(UPL_CREATE_LITE, size); lite_list = (wpl_array_t) - (((vm_offset_t)upl) + sizeof(struct upl)); + (((uintptr_t)upl) + sizeof(struct upl)); page_field_size = ((size/PAGE_SIZE) + 7) >> 3; page_field_size = (page_field_size + 3) & 0xFFFFFFFC; @@ -2256,23 +2443,62 @@ vm_object_upl_request( } } - if(object->phys_contiguous) { - upl->size = size; - upl->offset = offset + object->paging_offset; - *upl_ptr = upl; - if(user_page_list) { - user_page_list[0].phys_addr = - (offset + object->shadow_offset)>>12; - user_page_list[0].device = TRUE; + if (object->phys_contiguous) { + if ((cntrl_flags & UPL_WILL_MODIFY) && + object->copy != VM_OBJECT_NULL) { + /* Honor copy-on-write obligations */ + + /* + * XXX FBDP + * We could still have a race... + * A is here building the UPL for a write(). + * A pushes the pages to the current copy + * object. + * A returns the UPL to the caller. + * B comes along and establishes another + * private mapping on this object, inserting + * a new copy object between the original + * object and the old copy object. + * B reads a page and gets the original contents + * from the original object. + * A modifies the page in the original object. + * B reads the page again and sees A's changes, + * which is wrong... + * + * The problem is that the pages are not + * marked "busy" in the original object, so + * nothing prevents B from reading it before + * before A's changes are completed. + * + * The "paging_in_progress" might protect us + * from the insertion of a new copy object + * though... To be verified. 
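The lite_list pointer set up in the UPL creation paths above is the bitmap that the gather, commit and abort loops later index as lite_list[pg_num>>5] with bit (pg_num & 31). Those bit operations in isolation, using an illustrative wpl_word_t typedef:

#include <stdint.h>
#include <stdbool.h>

typedef uint32_t wpl_word_t;   /* the diff's wpl_array_t points at 32-bit words */

void lite_set(wpl_word_t *lite_list, unsigned pg_num) {
    lite_list[pg_num >> 5] |= (wpl_word_t)1 << (pg_num & 31);
}

bool lite_test(const wpl_word_t *lite_list, unsigned pg_num) {
    return (lite_list[pg_num >> 5] >> (pg_num & 31)) & 1u;
}

void lite_clear(wpl_word_t *lite_list, unsigned pg_num) {
    lite_list[pg_num >> 5] &= ~((wpl_word_t)1 << (pg_num & 31));
}

Setting a bit records that the page at that offset belongs to the UPL; commit and abort clear the bit as each page is processed, which is what makes the later "occupied" scan a simple test for any non-zero word.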
+ */ + vm_object_lock_request(object, + offset, + size, + FALSE, + MEMORY_OBJECT_COPY_SYNC, + VM_PROT_NO_CHANGE); + upl_cow_contiguous++; + upl_cow_contiguous_pages += size >> PAGE_SHIFT; } + upl->map_object = object; /* don't need any shadow mappings for this one */ /* since it is already I/O memory */ upl->flags |= UPL_DEVICE_MEMORY; - vm_object_lock(object); - vm_object_paging_begin(object); - vm_object_unlock(object); + + /* paging_in_progress protects paging_offset */ + upl->offset = offset + object->paging_offset; + upl->size = size; + *upl_ptr = upl; + if(user_page_list) { + user_page_list[0].phys_addr = + (offset + object->shadow_offset)>>PAGE_SHIFT; + user_page_list[0].device = TRUE; + } if(page_list_count != NULL) { if (upl->flags & UPL_INTERNAL) { @@ -2281,14 +2507,21 @@ vm_object_upl_request( *page_list_count = 1; } } + return KERN_SUCCESS; } + if(user_page_list) + user_page_list[0].device = FALSE; + if(cntrl_flags & UPL_SET_LITE) { upl->map_object = object; } else { upl->map_object = vm_object_allocate(size); - vm_object_lock(upl->map_object); + /* + * No neeed to lock the new object: nobody else knows + * about it yet, so it's all ours so far. + */ upl->map_object->shadow = object; upl->map_object->pageout = TRUE; upl->map_object->can_persist = FALSE; @@ -2296,24 +2529,68 @@ vm_object_upl_request( MEMORY_OBJECT_COPY_NONE; upl->map_object->shadow_offset = offset; upl->map_object->wimg_bits = object->wimg_bits; - vm_object_unlock(upl->map_object); } - upl->size = size; - upl->offset = offset + object->paging_offset; - *upl_ptr = upl; + } if (!(cntrl_flags & UPL_SET_LITE)) { VM_PAGE_GRAB_FICTITIOUS(alias_page); } + + /* + * ENCRYPTED SWAP: + * Just mark the UPL as "encrypted" here. + * We'll actually encrypt the pages later, + * in upl_encrypt(), when the caller has + * selected which pages need to go to swap. + */ + if (cntrl_flags & UPL_ENCRYPT) { + upl->flags |= UPL_ENCRYPTED; + } + if (cntrl_flags & UPL_FOR_PAGEOUT) { + upl->flags |= UPL_PAGEOUT; + } vm_object_lock(object); -#ifdef UBC_DEBUG - if(upl_ptr) + + /* we can lock in the paging_offset once paging_in_progress is set */ + if(upl_ptr) { + upl->size = size; + upl->offset = offset + object->paging_offset; + *upl_ptr = upl; +#ifdef UPL_DEBUG queue_enter(&object->uplq, upl, upl_t, uplq); -#endif /* UBC_DEBUG */ - vm_object_paging_begin(object); +#endif /* UPL_DEBUG */ + } + + if ((cntrl_flags & UPL_WILL_MODIFY) && + object->copy != VM_OBJECT_NULL) { + /* Honor copy-on-write obligations */ + + /* + * The caller is gathering these pages and + * might modify their contents. We need to + * make sure that the copy object has its own + * private copies of these pages before we let + * the caller modify them. 
+ */ + vm_object_update(object, + offset, + size, + NULL, + NULL, + FALSE, /* should_return */ + MEMORY_OBJECT_COPY_SYNC, + VM_PROT_NO_CHANGE); + upl_cow++; + upl_cow_pages += size >> PAGE_SHIFT; + + } + /* remember which copy object we synchronized with */ + last_copy_object = object->copy; + entry = 0; if(cntrl_flags & UPL_COPYOUT_FROM) { upl->flags |= UPL_PAGE_SYNC_DONE; + while (xfer_size) { if((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) { @@ -2321,44 +2598,88 @@ vm_object_upl_request( VM_PAGE_GRAB_FICTITIOUS(alias_page); vm_object_lock(object); } - if(((dst_page = vm_page_lookup(object, - dst_offset)) == VM_PAGE_NULL) || + if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) || dst_page->fictitious || dst_page->absent || dst_page->error || - (dst_page->wire_count != 0 && - !dst_page->pageout) || - ((!(dst_page->dirty || dst_page->precious || - pmap_is_modified(dst_page->phys_page))) - && (cntrl_flags & UPL_RET_ONLY_DIRTY)) || - ((!(dst_page->inactive)) - && (dst_page->page_ticket != page_ticket) - && ((dst_page->page_ticket+1) != page_ticket) - && (cntrl_flags & UPL_FOR_PAGEOUT)) || - ((!dst_page->list_req_pending) && - (cntrl_flags & UPL_RET_ONLY_DIRTY) && - pmap_is_referenced(dst_page->phys_page))) { - if(user_page_list) { + (dst_page->wire_count && !dst_page->pageout) || + + ((!dst_page->inactive) && (cntrl_flags & UPL_FOR_PAGEOUT) && + (dst_page->page_ticket != page_ticket) && + ((dst_page->page_ticket+1) != page_ticket)) ) { + + if (user_page_list) user_page_list[entry].phys_addr = 0; - user_page_list[entry].device = FALSE; - } - } else { - - if(dst_page->busy && - (!(dst_page->list_req_pending && - dst_page->pageout))) { - if(cntrl_flags & UPL_NOBLOCK) { - if(user_page_list) { + } else { + /* + * grab this up front... + * a high percentange of the time we're going to + * need the hardware modification state a bit later + * anyway... so we can eliminate an extra call into + * the pmap layer by grabbing it here and recording it + */ + refmod_state = pmap_get_refmod(dst_page->phys_page); + + if (cntrl_flags & UPL_RET_ONLY_DIRTY) { + /* + * we're only asking for DIRTY pages to be returned + */ + + if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) { + /* + * if we were the page stolen by vm_pageout_scan to be + * cleaned (as opposed to a buddy being clustered in + * or this request is not being driven by a PAGEOUT cluster + * then we only need to check for the page being diry or + * precious to decide whether to return it + */ + if (dst_page->dirty || dst_page->precious || + (refmod_state & VM_MEM_MODIFIED)) { + goto check_busy; + } + } + /* + * this is a request for a PAGEOUT cluster and this page + * is merely along for the ride as a 'buddy'... not only + * does it have to be dirty to be returned, but it also + * can't have been referenced recently... note that we've + * already filtered above based on whether this page is + * currently on the inactive queue or it meets the page + * ticket (generation count) check + */ + if ( !(refmod_state & VM_MEM_REFERENCED) && + ((refmod_state & VM_MEM_MODIFIED) || + dst_page->dirty || dst_page->precious) ) { + goto check_busy; + } + /* + * if we reach here, we're not to return + * the page... 
go on to the next one + */ + if (user_page_list) + user_page_list[entry].phys_addr = 0; + entry++; + dst_offset += PAGE_SIZE_64; + xfer_size -= PAGE_SIZE; + continue; + } +check_busy: + if(dst_page->busy && + (!(dst_page->list_req_pending && + dst_page->pageout))) { + if(cntrl_flags & UPL_NOBLOCK) { + if(user_page_list) { user_page_list[entry].phys_addr = 0; - user_page_list[entry].device = FALSE; } entry++; dst_offset += PAGE_SIZE_64; xfer_size -= PAGE_SIZE; continue; } - /*someone else is playing with the */ - /* page. We will have to wait. */ + /* + * someone else is playing with the + * page. We will have to wait. + */ PAGE_SLEEP(object, dst_page, THREAD_UNINT); continue; } @@ -2368,7 +2689,6 @@ vm_object_upl_request( !dst_page->list_req_pending) { if(user_page_list) { user_page_list[entry].phys_addr = 0; - user_page_list[entry].device = FALSE; } entry++; dst_offset += PAGE_SIZE_64; @@ -2379,9 +2699,17 @@ vm_object_upl_request( /* original object and its prodigy */ vm_page_lock_queues(); - if( !(cntrl_flags & UPL_FILE_IO)) { - pmap_page_protect(dst_page->phys_page, VM_PROT_NONE); - } + + if (dst_page->pageout_queue == TRUE) + /* + * we've buddied up a page for a clustered pageout + * that has already been moved to the pageout + * queue by pageout_scan... we need to remove + * it from the queue and drop the laundry count + * on that queue + */ + vm_pageout_queue_steal(dst_page); +#if MACH_CLUSTER_STATS /* pageout statistics gathering. count */ /* all the pages we will page out that */ /* were not counted in the initial */ @@ -2401,7 +2729,7 @@ vm_object_upl_request( (pages_at_lower_offsets++;) } } - +#endif /* Turn off busy indication on pending */ /* pageout. Note: we can only get here */ /* in the request pending case. */ @@ -2409,15 +2737,16 @@ vm_object_upl_request( dst_page->busy = FALSE; dst_page->cleaning = FALSE; - dirty = pmap_is_modified(dst_page->phys_page); - dirty = dirty ? TRUE : dst_page->dirty; + hw_dirty = refmod_state & VM_MEM_MODIFIED; + dirty = hw_dirty ? TRUE : dst_page->dirty; if(cntrl_flags & UPL_SET_LITE) { int pg_num; pg_num = (dst_offset-offset)/PAGE_SIZE; lite_list[pg_num>>5] |= 1 << (pg_num & 31); - pmap_clear_modify(dst_page->phys_page); + if (hw_dirty) + pmap_clear_modify(dst_page->phys_page); /* * Record that this page has been * written out @@ -2439,9 +2768,12 @@ vm_object_upl_request( /* use pageclean setup, it is more */ /* convenient even for the pageout */ /* cases here */ + + vm_object_lock(upl->map_object); vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size); + vm_object_unlock(upl->map_object); alias_page->absent = FALSE; alias_page = NULL; @@ -2455,17 +2787,30 @@ vm_object_upl_request( if(dst_page->pageout) dst_page->busy = TRUE; - if((!(cntrl_flags & UPL_CLEAN_IN_PLACE)) - || (cntrl_flags & UPL_FOR_PAGEOUT)) { - /* deny access to the target page */ - /* while it is being worked on */ - if((!dst_page->pageout) && - (dst_page->wire_count == 0)) { + if ( (cntrl_flags & UPL_ENCRYPT) ) { + /* + * ENCRYPTED SWAP: + * We want to deny access to the target page + * because its contents are about to be + * encrypted and the user would be very + * confused to see encrypted data instead + * of their data. 
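The UPL_RET_ONLY_DIRTY filtering above reduces to a predicate on the page's software state plus the hardware ref/mod bits fetched once via pmap_get_refmod(): a page explicitly targeted by pageout scan (or any request not driven by a pageout cluster) only needs to be dirty, precious, or hardware-modified, while a clustered "buddy" page must additionally not have been referenced recently. A hedged restatement of that decision as a standalone function; the struct and the bit values are simplified stand-ins.

#include <stdbool.h>

#define VM_MEM_REFERENCED 0x1   /* illustrative bit values for the ref/mod state */
#define VM_MEM_MODIFIED   0x2

struct page_state {
    bool dirty;                 /* software dirty bit */
    bool precious;              /* must not be discarded without writing */
    bool list_req_pending;      /* page was explicitly targeted by pageout scan */
};

/* Should this page be returned for a UPL_RET_ONLY_DIRTY request?
 * `for_pageout_cluster` corresponds to UPL_FOR_PAGEOUT being set. */
bool return_only_dirty(const struct page_state *p,
                       unsigned refmod_state,
                       bool for_pageout_cluster)
{
    bool modified = p->dirty || p->precious || (refmod_state & VM_MEM_MODIFIED);

    if (p->list_req_pending || !for_pageout_cluster)
        return modified;                      /* targeted page: dirty/precious is enough */

    /* buddy page picked up by clustering: must also look idle */
    return modified && !(refmod_state & VM_MEM_REFERENCED);
}

As the diff's comment notes, fetching the ref/mod state once up front also saves a second trip into the pmap layer when the modify bit is needed again later in the loop.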
+ */ + dst_page->busy = TRUE; + } + if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) { + /* + * deny access to the target page + * while it is being worked on + */ + if ((!dst_page->pageout) && + (dst_page->wire_count == 0)) { dst_page->busy = TRUE; dst_page->pageout = TRUE; vm_page_wire(dst_page); } } + if(user_page_list) { user_page_list[entry].phys_addr = dst_page->phys_page; @@ -2477,11 +2822,30 @@ vm_object_upl_request( dst_page->absent; user_page_list[entry].precious = dst_page->precious; - user_page_list[entry].device = - FALSE; } - vm_page_unlock_queues(); + + /* + * ENCRYPTED SWAP: + * The caller is gathering this page and might + * access its contents later on. Decrypt the + * page before adding it to the UPL, so that + * the caller never sees encrypted data. + */ + if (! (cntrl_flags & UPL_ENCRYPT) && + dst_page->encrypted) { + assert(dst_page->busy); + + vm_page_decrypt(dst_page, 0); + vm_page_decrypt_for_upl_counter++; + + /* + * Retry this page, since anything + * could have changed while we were + * decrypting. + */ + continue; + } } entry++; dst_offset += PAGE_SIZE_64; @@ -2495,7 +2859,50 @@ vm_object_upl_request( VM_PAGE_GRAB_FICTITIOUS(alias_page); vm_object_lock(object); } + + if ((cntrl_flags & UPL_WILL_MODIFY) && + object->copy != last_copy_object) { + /* Honor copy-on-write obligations */ + + /* + * The copy object has changed since we + * last synchronized for copy-on-write. + * Another copy object might have been + * inserted while we released the object's + * lock. Since someone could have seen the + * original contents of the remaining pages + * through that new object, we have to + * synchronize with it again for the remaining + * pages only. The previous pages are "busy" + * so they can not be seen through the new + * mapping. The new mapping will see our + * upcoming changes for those previous pages, + * but that's OK since they couldn't see what + * was there before. It's just a race anyway + * and there's no guarantee of consistency or + * atomicity. We just don't want new mappings + * to see both the *before* and *after* pages. + */ + if (object->copy != VM_OBJECT_NULL) { + vm_object_update( + object, + dst_offset,/* current offset */ + xfer_size, /* remaining size */ + NULL, + NULL, + FALSE, /* should_return */ + MEMORY_OBJECT_COPY_SYNC, + VM_PROT_NO_CHANGE); + upl_cow_again++; + upl_cow_again_pages += + xfer_size >> PAGE_SHIFT; + } + /* remember the copy object we synced with */ + last_copy_object = object->copy; + } + dst_page = vm_page_lookup(object, dst_offset); + if(dst_page != VM_PAGE_NULL) { if((cntrl_flags & UPL_RET_ONLY_ABSENT) && !((dst_page->list_req_pending) @@ -2506,7 +2913,6 @@ vm_object_upl_request( /* already present. 
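The last_copy_object bookkeeping above keeps the copy-on-write pushes from being repeated for the whole request: the object is synchronized once up front, and again only if a new copy object appeared while the lock was dropped, and then only for the pages not yet gathered, since the earlier ones are busy and cannot be observed through the new mapping. A sketch of that pattern in isolation; cow_sync() and the types are placeholders for vm_object_update() with MEMORY_OBJECT_COPY_SYNC.

#include <stdio.h>
#include <stddef.h>

typedef unsigned long long off64_t_stub;

struct object_stub {
    void *copy;                 /* current copy object; may change while unlocked */
};

/* Placeholder for vm_object_update(..., MEMORY_OBJECT_COPY_SYNC, ...):
 * push the given range to the copy object so later writes stay private. */
static void cow_sync(struct object_stub *obj, off64_t_stub offset, size_t len) {
    (void)obj;
    printf("sync [%llu, %llu)\n", offset, offset + (off64_t_stub)len);
}

static void gather_range(struct object_stub *obj, off64_t_stub offset,
                         size_t size, size_t page_size)
{
    void *last_copy_object;

    cow_sync(obj, offset, size);        /* initial push for the whole range */
    last_copy_object = obj->copy;

    for (size_t done = 0; done < size; done += page_size) {
        /* ... the lock may have been dropped here, and another thread may
         * have inserted a new copy object in the meantime ... */
        if (obj->copy != last_copy_object) {
            if (obj->copy != NULL)
                cow_sync(obj, offset + done, size - done);  /* remaining pages only */
            last_copy_object = obj->copy;
        }
        /* gather the page at offset + done (marked busy in the real code) */
    }
}

int main(void) {
    struct object_stub obj = { .copy = NULL };
    gather_range(&obj, 0, 8 * 4096, 4096);
    return 0;
}

The diff also counts these re-syncs in the upl_cow_again / upl_cow_again_pages statistics declared earlier.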
*/ if(user_page_list) { user_page_list[entry].phys_addr = 0; - user_page_list[entry].device = FALSE; } entry++; dst_offset += PAGE_SIZE_64; @@ -2525,19 +2931,20 @@ vm_object_upl_request( /* dump the fictitious page */ dst_page->list_req_pending = FALSE; dst_page->clustered = FALSE; + vm_page_lock_queues(); vm_page_free(dst_page); vm_page_unlock_queues(); + + dst_page = NULL; } else if ((dst_page->absent && dst_page->list_req_pending)) { /* the default_pager case */ dst_page->list_req_pending = FALSE; dst_page->busy = FALSE; - dst_page->clustered = FALSE; } } - if((dst_page = vm_page_lookup(object, dst_offset)) == - VM_PAGE_NULL) { + if(dst_page == VM_PAGE_NULL) { if(object->private) { /* * This is a nasty wrinkle for users @@ -2550,7 +2957,6 @@ vm_object_upl_request( */ if(user_page_list) { user_page_list[entry].phys_addr = 0; - user_page_list[entry].device = FALSE; } entry++; dst_offset += PAGE_SIZE_64; @@ -2572,6 +2978,18 @@ vm_object_upl_request( dst_page->unlock_request = 0; } #endif + if(cntrl_flags & UPL_RET_ONLY_ABSENT) { + /* + * if UPL_RET_ONLY_ABSENT was specified, + * than we're definitely setting up a + * upl for a clustered read/pagein + * operation... mark the pages as clustered + * so vm_fault can correctly attribute them + * to the 'pagein' bucket the first time + * a fault happens on them + */ + dst_page->clustered = TRUE; + } dst_page->absent = TRUE; object->absent_count++; } @@ -2581,6 +2999,24 @@ vm_object_upl_request( dst_page->unlock_request = 0; } #endif /* 1 */ + + /* + * ENCRYPTED SWAP: + */ + if (cntrl_flags & UPL_ENCRYPT) { + /* + * The page is going to be encrypted when we + * get it from the pager, so mark it so. + */ + dst_page->encrypted = TRUE; + } else { + /* + * Otherwise, the page will not contain + * encrypted data. + */ + dst_page->encrypted = FALSE; + } + dst_page->overwriting = TRUE; if(dst_page->fictitious) { panic("need corner case for fictitious page"); @@ -2599,20 +3035,21 @@ vm_object_upl_request( PAGE_SLEEP(object, dst_page, THREAD_UNINT); continue; } - vm_page_lock_queues(); - if( !(cntrl_flags & UPL_FILE_IO)) { - pmap_page_protect(dst_page->phys_page, VM_PROT_NONE); - } - dirty = pmap_is_modified(dst_page->phys_page); - dirty = dirty ? TRUE : dst_page->dirty; + + if( !(cntrl_flags & UPL_FILE_IO)) + hw_dirty = pmap_disconnect(dst_page->phys_page) & VM_MEM_MODIFIED; + else + hw_dirty = pmap_get_refmod(dst_page->phys_page) & VM_MEM_MODIFIED; + dirty = hw_dirty ? TRUE : dst_page->dirty; if(cntrl_flags & UPL_SET_LITE) { int pg_num; pg_num = (dst_offset-offset)/PAGE_SIZE; lite_list[pg_num>>5] |= 1 << (pg_num & 31); - pmap_clear_modify(dst_page->phys_page); + if (hw_dirty) + pmap_clear_modify(dst_page->phys_page); /* * Record that this page has been * written out @@ -2634,9 +3071,11 @@ vm_object_upl_request( /* use pageclean setup, it is more */ /* convenient even for the pageout */ /* cases here */ + vm_object_lock(upl->map_object); vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size); + vm_object_unlock(upl->map_object); alias_page->absent = FALSE; alias_page = NULL; @@ -2665,8 +3104,22 @@ vm_object_upl_request( } else { vm_page_wire(dst_page); } - /* expect the page to be used */ - dst_page->reference = TRUE; + if(cntrl_flags & UPL_RET_ONLY_ABSENT) { + /* + * expect the page not to be used + * since it's coming in as part + * of a cluster and could be + * speculative... 
pages that + * are 'consumed' will get a + * hardware reference + */ + dst_page->reference = FALSE; + } else { + /* + * expect the page to be used + */ + dst_page->reference = TRUE; + } dst_page->precious = (cntrl_flags & UPL_PRECIOUS) ? TRUE : FALSE; @@ -2681,12 +3134,9 @@ vm_object_upl_request( dst_page->absent; user_page_list[entry].precious = dst_page->precious; - user_page_list[entry].device = - FALSE; } vm_page_unlock_queues(); } - entry++; dst_offset += PAGE_SIZE_64; xfer_size -= PAGE_SIZE; @@ -2727,45 +3177,55 @@ vm_object_upl_request( THREAD_UNINT); if (wait_result != THREAD_AWAKENED) { vm_object_unlock(object); - return(KERN_FAILURE); + return KERN_FAILURE; } continue; } vm_object_unlock(object); - - if (rc = memory_object_data_unlock( + rc = memory_object_data_unlock( object->pager, dst_offset + object->paging_offset, size, - access_required)) { - if (rc == MACH_SEND_INTERRUPTED) - continue; - else - return KERN_FAILURE; - } - break; - + access_required); + if (rc != KERN_SUCCESS && rc != MACH_SEND_INTERRUPTED) + return KERN_FAILURE; + vm_object_lock(object); + + if (rc == KERN_SUCCESS) + break; } + /* lets wait on the last page requested */ /* NOTE: we will have to update lock completed routine to signal */ if(dst_page != VM_PAGE_NULL && (access_required & dst_page->page_lock) != access_required) { PAGE_ASSERT_WAIT(dst_page, THREAD_UNINT); - thread_block((void (*)(void))0); - vm_object_lock(object); + vm_object_unlock(object); + thread_block(THREAD_CONTINUE_NULL); + return KERN_SUCCESS; } } + vm_object_unlock(object); return KERN_SUCCESS; } /* JMM - Backward compatability for now */ kern_return_t +vm_fault_list_request( /* forward */ + memory_object_control_t control, + vm_object_offset_t offset, + upl_size_t size, + upl_t *upl_ptr, + upl_page_info_t **user_page_list_ptr, + int page_list_count, + int cntrl_flags); +kern_return_t vm_fault_list_request( memory_object_control_t control, vm_object_offset_t offset, - vm_size_t size, + upl_size_t size, upl_t *upl_ptr, upl_page_info_t **user_page_list_ptr, int page_list_count, @@ -2817,8 +3277,8 @@ __private_extern__ kern_return_t vm_object_super_upl_request( vm_object_t object, vm_object_offset_t offset, - vm_size_t size, - vm_size_t super_cluster, + upl_size_t size, + upl_size_t super_cluster, upl_t *upl, upl_page_info_t *user_page_list, unsigned int *page_list_count, @@ -2827,11 +3287,17 @@ vm_object_super_upl_request( vm_page_t target_page; int ticket; + if(object->paging_offset > offset) return KERN_FAILURE; + assert(object->paging_in_progress); offset = offset - object->paging_offset; + if(cntrl_flags & UPL_FOR_PAGEOUT) { + + vm_object_lock(object); + if((target_page = vm_page_lookup(object, offset)) != VM_PAGE_NULL) { ticket = target_page->page_ticket; @@ -2840,18 +3306,13 @@ vm_object_super_upl_request( ((ticket << UPL_PAGE_TICKET_SHIFT) & UPL_PAGE_TICKET_MASK); } + vm_object_unlock(object); } - -/* turns off super cluster exercised by the default_pager */ -/* -super_cluster = size; -*/ - if ((super_cluster > size) && - (vm_page_free_count > vm_page_free_reserved)) { + if (super_cluster > size) { vm_object_offset_t base_offset; - vm_size_t super_size; + upl_size_t super_size; base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1)); @@ -2860,31 +3321,261 @@ super_cluster = size; super_size = ((base_offset + super_size) > object->size) ? 
(object->size - base_offset) : super_size; if(offset > (base_offset + super_size)) - panic("vm_object_super_upl_request: Missed target pageout 0x%x,0x%x, 0x%x, 0x%x, 0x%x, 0x%x\n", offset, base_offset, super_size, super_cluster, size, object->paging_offset); - /* apparently there is a case where the vm requests a */ - /* page to be written out who's offset is beyond the */ - /* object size */ + panic("vm_object_super_upl_request: Missed target pageout" + " %#llx,%#llx, %#x, %#x, %#x, %#llx\n", + offset, base_offset, super_size, super_cluster, + size, object->paging_offset); + /* + * apparently there is a case where the vm requests a + * page to be written out who's offset is beyond the + * object size + */ if((offset + size) > (base_offset + super_size)) super_size = (offset + size) - base_offset; offset = base_offset; size = super_size; } - vm_object_upl_request(object, offset, size, - upl, user_page_list, page_list_count, - cntrl_flags); + return vm_object_upl_request(object, offset, size, + upl, user_page_list, page_list_count, + cntrl_flags); } + +kern_return_t +vm_map_create_upl( + vm_map_t map, + vm_map_address_t offset, + upl_size_t *upl_size, + upl_t *upl, + upl_page_info_array_t page_list, + unsigned int *count, + int *flags) +{ + vm_map_entry_t entry; + int caller_flags; + int force_data_sync; + int sync_cow_data; + vm_object_t local_object; + vm_map_offset_t local_offset; + vm_map_offset_t local_start; + kern_return_t ret; + + caller_flags = *flags; + + if (caller_flags & ~UPL_VALID_FLAGS) { + /* + * For forward compatibility's sake, + * reject any unknown flag. + */ + return KERN_INVALID_VALUE; + } + + force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC); + sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM); + + if(upl == NULL) + return KERN_INVALID_ARGUMENT; + + +REDISCOVER_ENTRY: + vm_map_lock(map); + if (vm_map_lookup_entry(map, offset, &entry)) { + if (entry->object.vm_object == VM_OBJECT_NULL || + !entry->object.vm_object->phys_contiguous) { + if((*upl_size/page_size) > MAX_UPL_TRANSFER) { + *upl_size = MAX_UPL_TRANSFER * page_size; + } + } + if((entry->vme_end - offset) < *upl_size) { + *upl_size = entry->vme_end - offset; + } + if (caller_flags & UPL_QUERY_OBJECT_TYPE) { + if (entry->object.vm_object == VM_OBJECT_NULL) { + *flags = 0; + } else if (entry->object.vm_object->private) { + *flags = UPL_DEV_MEMORY; + if (entry->object.vm_object->phys_contiguous) { + *flags |= UPL_PHYS_CONTIG; + } + } else { + *flags = 0; + } + vm_map_unlock(map); + return KERN_SUCCESS; + } + /* + * Create an object if necessary. 
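vm_object_super_upl_request() above grows a single-page pageout into a larger, naturally aligned cluster: the start is rounded down to a super_cluster boundary, the window is clamped to the object's size, and it is stretched again if the original request would otherwise fall outside it. The helper below is a simplified restatement of the arithmetic visible in the hunk (the elided context may adjust the initial cluster size further); the names are illustrative and offsets are byte offsets into the object.

#include <stdint.h>
#include <assert.h>
#include <stdio.h>

/* Expand (offset, size) to a super_cluster-aligned window inside an object
 * of object_size bytes.  super_cluster must be a power of two. */
void super_cluster_window(uint64_t offset, uint64_t size,
                          uint64_t super_cluster, uint64_t object_size,
                          uint64_t *out_offset, uint64_t *out_size)
{
    assert((super_cluster & (super_cluster - 1)) == 0);

    uint64_t base = offset & ~(super_cluster - 1);     /* round down to boundary */
    uint64_t super_size = (base + super_cluster) > object_size
                        ? object_size - base           /* clamp to end of object */
                        : super_cluster;

    /* the original request may extend past the clamped window */
    if (offset + size > base + super_size)
        super_size = (offset + size) - base;

    *out_offset = base;
    *out_size = super_size;
}

int main(void)
{
    uint64_t o, s;
    /* one page at 0x3000 in a 0x9000-byte object, with 32 KB clusters */
    super_cluster_window(0x3000, 0x1000, 0x8000, 0x9000, &o, &s);
    printf("offset=%#llx size=%#llx\n",
           (unsigned long long)o, (unsigned long long)s);  /* offset=0 size=0x8000 */
    return 0;
}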
+ */ + if (entry->object.vm_object == VM_OBJECT_NULL) { + entry->object.vm_object = vm_object_allocate( + (vm_size_t)(entry->vme_end - entry->vme_start)); + entry->offset = 0; + } + if (!(caller_flags & UPL_COPYOUT_FROM)) { + if (!(entry->protection & VM_PROT_WRITE)) { + vm_map_unlock(map); + return KERN_PROTECTION_FAILURE; + } + if (entry->needs_copy) { + vm_map_t local_map; + vm_object_t object; + vm_map_offset_t offset_hi; + vm_map_offset_t offset_lo; + vm_object_offset_t new_offset; + vm_prot_t prot; + boolean_t wired; + vm_behavior_t behavior; + vm_map_version_t version; + vm_map_t real_map; + + local_map = map; + vm_map_lock_write_to_read(map); + if(vm_map_lookup_locked(&local_map, + offset, VM_PROT_WRITE, + &version, &object, + &new_offset, &prot, &wired, + &behavior, &offset_lo, + &offset_hi, &real_map)) { + vm_map_unlock(local_map); + return KERN_FAILURE; + } + if (real_map != map) { + vm_map_unlock(real_map); + } + vm_object_unlock(object); + vm_map_unlock(local_map); + + goto REDISCOVER_ENTRY; + } + } + if (entry->is_sub_map) { + vm_map_t submap; + + submap = entry->object.sub_map; + local_start = entry->vme_start; + local_offset = entry->offset; + vm_map_reference(submap); + vm_map_unlock(map); + + ret = (vm_map_create_upl(submap, + local_offset + (offset - local_start), + upl_size, upl, page_list, count, + flags)); + + vm_map_deallocate(submap); + return ret; + } + + if (sync_cow_data) { + if (entry->object.vm_object->shadow + || entry->object.vm_object->copy) { + + local_object = entry->object.vm_object; + local_start = entry->vme_start; + local_offset = entry->offset; + vm_object_reference(local_object); + vm_map_unlock(map); + + if (entry->object.vm_object->shadow && + entry->object.vm_object->copy) { + vm_object_lock_request( + local_object->shadow, + (vm_object_offset_t) + ((offset - local_start) + + local_offset) + + local_object->shadow_offset, + *upl_size, FALSE, + MEMORY_OBJECT_DATA_SYNC, + VM_PROT_NO_CHANGE); + } + sync_cow_data = FALSE; + vm_object_deallocate(local_object); + goto REDISCOVER_ENTRY; + } + } + + if (force_data_sync) { + + local_object = entry->object.vm_object; + local_start = entry->vme_start; + local_offset = entry->offset; + vm_object_reference(local_object); + vm_map_unlock(map); + + vm_object_lock_request( + local_object, + (vm_object_offset_t) + ((offset - local_start) + local_offset), + (vm_object_size_t)*upl_size, FALSE, + MEMORY_OBJECT_DATA_SYNC, + VM_PROT_NO_CHANGE); + force_data_sync = FALSE; + vm_object_deallocate(local_object); + goto REDISCOVER_ENTRY; + } + + if(!(entry->object.vm_object->private)) { + if(*upl_size > (MAX_UPL_TRANSFER*PAGE_SIZE)) + *upl_size = (MAX_UPL_TRANSFER*PAGE_SIZE); + if(entry->object.vm_object->phys_contiguous) { + *flags = UPL_PHYS_CONTIG; + } else { + *flags = 0; + } + } else { + *flags = UPL_DEV_MEMORY | UPL_PHYS_CONTIG; + } + local_object = entry->object.vm_object; + local_offset = entry->offset; + local_start = entry->vme_start; + vm_object_reference(local_object); + vm_map_unlock(map); + if(caller_flags & UPL_SET_IO_WIRE) { + ret = (vm_object_iopl_request(local_object, + (vm_object_offset_t) + ((offset - local_start) + + local_offset), + *upl_size, + upl, + page_list, + count, + caller_flags)); + } else { + ret = (vm_object_upl_request(local_object, + (vm_object_offset_t) + ((offset - local_start) + + local_offset), + *upl_size, + upl, + page_list, + count, + caller_flags)); + } + vm_object_deallocate(local_object); + return(ret); + } + + vm_map_unlock(map); + return(KERN_FAILURE); +} + +/* + * Internal 
routine to enter a UPL into a VM map. + * + * JMM - This should just be doable through the standard + * vm_map_enter() API. + */ kern_return_t -vm_upl_map( - vm_map_t map, - upl_t upl, - vm_offset_t *dst_addr) +vm_map_enter_upl( + vm_map_t map, + upl_t upl, + vm_map_offset_t *dst_addr) { - vm_size_t size; + vm_map_size_t size; vm_object_offset_t offset; - vm_offset_t addr; + vm_map_offset_t addr; vm_page_t m; kern_return_t kr; @@ -2910,12 +3601,12 @@ vm_upl_map( if(upl->flags & UPL_INTERNAL) { lite_list = (wpl_array_t) - ((((vm_offset_t)upl) + sizeof(struct upl)) + ((((uintptr_t)upl) + sizeof(struct upl)) + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t))); } else { lite_list = (wpl_array_t) - (((vm_offset_t)upl) + sizeof(struct upl)); + (((uintptr_t)upl) + sizeof(struct upl)); } object = upl->map_object; upl->map_object = vm_object_allocate(upl->size); @@ -2928,11 +3619,12 @@ vm_upl_map( upl->map_object->shadow_offset = upl->offset - object->paging_offset; upl->map_object->wimg_bits = object->wimg_bits; - vm_object_unlock(upl->map_object); offset = upl->map_object->shadow_offset; new_offset = 0; size = upl->size; + vm_object_lock(object); + while(size) { pg_num = (new_offset)/PAGE_SIZE; if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) { @@ -2955,7 +3647,26 @@ vm_upl_map( alias_page->private = TRUE; alias_page->pageout = TRUE; alias_page->phys_page = m->phys_page; + + vm_page_lock_queues(); vm_page_wire(alias_page); + vm_page_unlock_queues(); + + /* + * ENCRYPTED SWAP: + * The virtual page ("m") has to be wired in some way + * here or its physical page ("m->phys_page") could + * be recycled at any time. + * Assuming this is enforced by the caller, we can't + * get an encrypted page here. Since the encryption + * key depends on the VM page's "pager" object and + * the "paging_offset", we couldn't handle 2 pageable + * VM pages (with different pagers and paging_offsets) + * sharing the same physical page: we could end up + * encrypting with one key (via one VM page) and + * decrypting with another key (via the alias VM page). + */ + ASSERT_PAGE_DECRYPTED(m); vm_page_insert(alias_page, upl->map_object, new_offset); @@ -2969,9 +3680,13 @@ vm_upl_map( new_offset += PAGE_SIZE_64; } vm_object_unlock(object); + vm_object_unlock(upl->map_object); } + if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous) + offset = upl->offset - upl->map_object->paging_offset; + else + offset = 0; - offset = 0; /* Always map the entire object */ size = upl->size; vm_object_lock(upl->map_object); @@ -2983,8 +3698,8 @@ vm_upl_map( /* NEED A UPL_MAP ALIAS */ - kr = vm_map_enter(map, dst_addr, size, (vm_offset_t) 0, TRUE, - upl->map_object, offset, FALSE, + kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0, + VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT); if (kr != KERN_SUCCESS) { @@ -2992,6 +3707,8 @@ vm_upl_map( return(kr); } + vm_object_lock(upl->map_object); + for(addr=*dst_addr; size > 0; size-=PAGE_SIZE,addr+=PAGE_SIZE) { m = vm_page_lookup(upl->map_object, offset); if(m) { @@ -3004,6 +3721,8 @@ vm_upl_map( } offset+=PAGE_SIZE_64; } + vm_object_unlock(upl->map_object); + upl->ref_count++; /* hold a reference for the mapping */ upl->flags |= UPL_PAGE_LIST_MAPPED; upl->kaddr = *dst_addr; @@ -3011,14 +3730,23 @@ vm_upl_map( return KERN_SUCCESS; } - +/* + * Internal routine to remove a UPL mapping from a VM map. + * + * XXX - This should just be doable through a standard + * vm_map_remove() operation. 
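The teardown path defined just below, vm_map_remove_upl(), removes the kernel mapping with vm_map_remove() over page-rounded bounds (vm_map_trunc_page / vm_map_round_page) instead of the old vm_deallocate() call. The rounding itself amounts to the helpers sketched here, assuming a 4 KB page size purely for illustration.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ull            /* illustrative page size */
#define PAGE_MASK (PAGE_SIZE - 1)

/* Analogs of vm_map_trunc_page()/vm_map_round_page(): align an arbitrary
 * mapped range outward to whole pages before removing the mapping. */
static uint64_t trunc_page64(uint64_t addr) { return addr & ~PAGE_MASK; }
static uint64_t round_page64(uint64_t addr) { return (addr + PAGE_MASK) & ~PAGE_MASK; }

int main(void)
{
    uint64_t kaddr = 0x12345678, size = 0x2345;   /* arbitrary mapped UPL range */
    printf("remove [%#llx, %#llx)\n",
           (unsigned long long)trunc_page64(kaddr),
           (unsigned long long)round_page64(kaddr + size));
    return 0;
}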
Otherwise, implicit clean-up + * of the target map won't be able to correctly remove + * these (and release the reference on the UPL). Having + * to do this means we can't map these into user-space + * maps yet. + */ kern_return_t -vm_upl_unmap( +vm_map_remove_upl( vm_map_t map, upl_t upl) { vm_address_t addr; - vm_size_t size; + upl_size_t size; if (upl == UPL_NULL) return KERN_INVALID_ARGUMENT; @@ -3033,7 +3761,10 @@ vm_upl_unmap( upl->kaddr = (vm_offset_t) 0; upl_unlock(upl); - vm_deallocate(map, addr, size); + vm_map_remove( map, + vm_map_trunc_page(addr), + vm_map_round_page(addr + size), + VM_MAP_NO_FLAGS); return KERN_SUCCESS; } upl_unlock(upl); @@ -3043,30 +3774,34 @@ vm_upl_unmap( kern_return_t upl_commit_range( upl_t upl, - vm_offset_t offset, - vm_size_t size, + upl_offset_t offset, + upl_size_t size, int flags, upl_page_info_t *page_list, mach_msg_type_number_t count, boolean_t *empty) { - vm_size_t xfer_size = size; + upl_size_t xfer_size = size; vm_object_t shadow_object; vm_object_t object = upl->map_object; vm_object_offset_t target_offset; int entry; wpl_array_t lite_list; int occupied; + int delayed_unlock = 0; + int clear_refmod = 0; + boolean_t shadow_internal; *empty = FALSE; if (upl == UPL_NULL) return KERN_INVALID_ARGUMENT; + if (count == 0) page_list = NULL; - if(object->pageout) { + if (object->pageout) { shadow_object = object->shadow; } else { shadow_object = object; @@ -3074,90 +3809,146 @@ upl_commit_range( upl_lock(upl); + if (upl->flags & UPL_ACCESS_BLOCKED) { + /* + * We used this UPL to block access to the pages by marking + * them "busy". Now we need to clear the "busy" bit to allow + * access to these pages again. + */ + flags |= UPL_COMMIT_ALLOW_ACCESS; + } + + if (upl->flags & UPL_CLEAR_DIRTY) + flags |= UPL_COMMIT_CLEAR_DIRTY; - if(upl->flags & UPL_DEVICE_MEMORY) { + if (upl->flags & UPL_DEVICE_MEMORY) { xfer_size = 0; } else if ((offset + size) > upl->size) { upl_unlock(upl); return KERN_FAILURE; } - if(upl->flags & UPL_INTERNAL) { + if (upl->flags & UPL_INTERNAL) { lite_list = (wpl_array_t) - ((((vm_offset_t)upl) + sizeof(struct upl)) + ((((uintptr_t)upl) + sizeof(struct upl)) + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t))); } else { lite_list = (wpl_array_t) - (((vm_offset_t)upl) + sizeof(struct upl)); + (((uintptr_t)upl) + sizeof(struct upl)); } - + if (object != shadow_object) + vm_object_lock(object); vm_object_lock(shadow_object); + shadow_internal = shadow_object->internal; + entry = offset/PAGE_SIZE; target_offset = (vm_object_offset_t)offset; - while(xfer_size) { + + while (xfer_size) { vm_page_t t,m; upl_page_info_t *p; m = VM_PAGE_NULL; - if(upl->flags & UPL_LITE) { - int pg_num; + + if (upl->flags & UPL_LITE) { + int pg_num; + pg_num = target_offset/PAGE_SIZE; - if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) { - lite_list[pg_num>>5] &= ~(1 << (pg_num & 31)); + + if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) { + lite_list[pg_num>>5] &= ~(1 << (pg_num & 31)); m = vm_page_lookup(shadow_object, - target_offset + (upl->offset - - shadow_object->paging_offset)); + target_offset + (upl->offset - + shadow_object->paging_offset)); } } - if(object->pageout) { - if ((t = vm_page_lookup(object, target_offset)) - != NULL) { + if (object->pageout) { + if ((t = vm_page_lookup(object, target_offset)) != NULL) { t->pageout = FALSE; + + if (delayed_unlock) { + delayed_unlock = 0; + vm_page_unlock_queues(); + } VM_PAGE_FREE(t); - if(m == NULL) { + + if (m == NULL) { m = vm_page_lookup( shadow_object, target_offset + object->shadow_offset); } - if(m 
!= VM_PAGE_NULL) + if (m != VM_PAGE_NULL) vm_object_paging_end(m->object); } } + if (m != VM_PAGE_NULL) { + + clear_refmod = 0; + + if (upl->flags & UPL_IO_WIRE) { + + if (delayed_unlock == 0) + vm_page_lock_queues(); - if(m != VM_PAGE_NULL) { - if(upl->flags & UPL_IO_WIRE) { - vm_page_lock_queues(); vm_page_unwire(m); - vm_page_unlock_queues(); - if(page_list) { + + if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) { + delayed_unlock = 0; + vm_page_unlock_queues(); + } + if (page_list) { page_list[entry].phys_addr = 0; } if (flags & UPL_COMMIT_SET_DIRTY) { - m->dirty = TRUE; - } else if ((upl->flags & UPL_CLEAR_DIRTY) || - (flags & UPL_COMMIT_CLEAR_DIRTY)) { - pmap_clear_modify(m->phys_page); + m->dirty = TRUE; + } else if (flags & UPL_COMMIT_CLEAR_DIRTY) { m->dirty = FALSE; + clear_refmod |= VM_MEM_MODIFIED; } if (flags & UPL_COMMIT_INACTIVATE) { - vm_page_deactivate(m); m->reference = FALSE; - pmap_clear_reference(m->phys_page); + clear_refmod |= VM_MEM_REFERENCED; + vm_page_deactivate(m); + } + if (clear_refmod) + pmap_clear_refmod(m->phys_page, clear_refmod); + + if (flags & UPL_COMMIT_ALLOW_ACCESS) { + /* + * We blocked access to the pages in this UPL. + * Clear the "busy" bit and wake up any waiter + * for this page. + */ + PAGE_WAKEUP_DONE(m); } + target_offset += PAGE_SIZE_64; xfer_size -= PAGE_SIZE; entry++; continue; } - vm_page_lock_queues(); - if ((upl->flags & UPL_CLEAR_DIRTY) || - (flags & UPL_COMMIT_CLEAR_DIRTY)) { - pmap_clear_modify(m->phys_page); + if (delayed_unlock == 0) + vm_page_lock_queues(); + /* + * make sure to clear the hardware + * modify or reference bits before + * releasing the BUSY bit on this page + * otherwise we risk losing a legitimate + * change of state + */ + if (flags & UPL_COMMIT_CLEAR_DIRTY) { m->dirty = FALSE; + clear_refmod |= VM_MEM_MODIFIED; } - if(page_list) { + if (flags & UPL_COMMIT_INACTIVATE) + clear_refmod |= VM_MEM_REFERENCED; + + if (clear_refmod) + pmap_clear_refmod(m->phys_page, clear_refmod); + + if (page_list) { p = &(page_list[entry]); if(p->phys_addr && p->pageout && !m->pageout) { m->busy = TRUE; @@ -3176,13 +3967,7 @@ upl_commit_range( } m->dump_cleaning = FALSE; if(m->laundry) { - vm_page_laundry_count--; - m->laundry = FALSE; - if (vm_page_laundry_count < vm_page_laundry_min) { - vm_page_laundry_min = 0; - thread_wakeup((event_t) - &vm_page_laundry_count); - } + vm_pageout_throttle_up(m); } if(m->pageout) { m->cleaning = FALSE; @@ -3190,36 +3975,38 @@ upl_commit_range( #if MACH_CLUSTER_STATS if (m->wanted) vm_pageout_target_collisions++; #endif - pmap_page_protect(m->phys_page, VM_PROT_NONE); - m->dirty = pmap_is_modified(m->phys_page); + if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED) + m->dirty = TRUE; + else + m->dirty = FALSE; + if(m->dirty) { - CLUSTER_STAT( - vm_pageout_target_page_dirtied++;) vm_page_unwire(m);/* reactivates */ - VM_STAT(reactivations++); + + if (upl->flags & UPL_PAGEOUT) { + CLUSTER_STAT(vm_pageout_target_page_dirtied++;) + VM_STAT(reactivations++); + } PAGE_WAKEUP_DONE(m); } else { - CLUSTER_STAT( - vm_pageout_target_page_freed++;) vm_page_free(m);/* clears busy, etc. 
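upl_commit_range() above now folds the hardware bit clearing into one call per page: instead of separate pmap_clear_modify/pmap_clear_reference calls, it accumulates VM_MEM_MODIFIED and VM_MEM_REFERENCED into clear_refmod and issues a single pmap_clear_refmod(). A sketch of that accumulation, with the pmap call stubbed out and the flag values chosen purely for illustration:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define VM_MEM_REFERENCED 0x1          /* illustrative bit values */
#define VM_MEM_MODIFIED   0x2

#define UPL_COMMIT_CLEAR_DIRTY 0x1     /* illustrative flag values, not the kernel's */
#define UPL_COMMIT_INACTIVATE  0x2

/* Stub for pmap_clear_refmod(): clear the given hardware bits for a page. */
static void pmap_clear_refmod_stub(uintptr_t phys_page, unsigned mask) {
    printf("clear %#x on page %#lx\n", mask, (unsigned long)phys_page);
}

void commit_one_page(uintptr_t phys_page, int flags, bool *dirty, bool *reference)
{
    unsigned clear_refmod = 0;

    if (flags & UPL_COMMIT_CLEAR_DIRTY) {
        *dirty = false;                     /* software copy of the state */
        clear_refmod |= VM_MEM_MODIFIED;    /* and the hardware modify bit */
    }
    if (flags & UPL_COMMIT_INACTIVATE) {
        *reference = false;
        clear_refmod |= VM_MEM_REFERENCED;
    }
    if (clear_refmod)                       /* one pmap call instead of two */
        pmap_clear_refmod_stub(phys_page, clear_refmod);
}

The diff's comment explains the ordering: the bits are cleared before the busy bit is released, otherwise a legitimate change of state could be lost.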
*/ - VM_STAT(pageouts++); + + if (upl->flags & UPL_PAGEOUT) { + CLUSTER_STAT(vm_pageout_target_page_freed++;) + + if (page_list[entry].dirty) + VM_STAT(pageouts++); + } } - vm_page_unlock_queues(); + if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) { + delayed_unlock = 0; + vm_page_unlock_queues(); + } target_offset += PAGE_SIZE_64; xfer_size -= PAGE_SIZE; entry++; continue; } - if (flags & UPL_COMMIT_INACTIVATE) { - vm_page_deactivate(m); - m->reference = FALSE; - pmap_clear_reference(m->phys_page); - } else if (!m->active && !m->inactive) { - if (m->reference) - vm_page_activate(m); - else - vm_page_deactivate(m); - } #if MACH_CLUSTER_STATS m->dirty = pmap_is_modified(m->phys_page); @@ -3244,14 +4031,15 @@ upl_commit_range( m->dirty = FALSE; } else if (m->overwriting) { /* alternate request page list, write to - /* page_list case. Occurs when the original - /* page was wired at the time of the list - /* request */ + * page_list case. Occurs when the original + * page was wired at the time of the list + * request */ assert(m->wire_count != 0); vm_page_unwire(m);/* reactivates */ m->overwriting = FALSE; } m->cleaning = FALSE; + /* It is a part of the semantic of COPYOUT_FROM */ /* UPLs that a commit implies cache sync */ /* between the vm page and the backing store */ @@ -3260,22 +4048,47 @@ upl_commit_range( if (upl->flags & UPL_PAGE_SYNC_DONE) m->precious = FALSE; - if (flags & UPL_COMMIT_SET_DIRTY) { - m->dirty = TRUE; + if (flags & UPL_COMMIT_SET_DIRTY) + m->dirty = TRUE; + + if (flags & UPL_COMMIT_INACTIVATE) { + m->reference = FALSE; + vm_page_deactivate(m); + } else if (!m->active && !m->inactive) { + if (m->reference) + vm_page_activate(m); + else + vm_page_deactivate(m); + } + + if (flags & UPL_COMMIT_ALLOW_ACCESS) { + /* + * We blocked access to the pages in this URL. + * Clear the "busy" bit on this page before we + * wake up any waiter. + */ + m->busy = FALSE; } + /* * Wakeup any thread waiting for the page to be un-cleaning. 
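Both vm_pageout_scan (earlier in this diff) and upl_commit_range now hold the page-queues lock across a batch of pages, dropping it only every DELAYED_UNLOCK_LIMIT iterations so other lock waiters can make progress. The shape of that pattern, modeled with a pthread mutex; the limit value, process_page() and commit_pages() are stand-ins.

#include <pthread.h>

#define DELAYED_UNLOCK_LIMIT 50          /* illustrative batch size */

static pthread_mutex_t page_queues_lock = PTHREAD_MUTEX_INITIALIZER;

static void process_page(int i) { (void)i; /* work done under the lock */ }

void commit_pages(int npages)
{
    int delayed_unlock = 0;

    for (int i = 0; i < npages; i++) {
        if (delayed_unlock == 0)
            pthread_mutex_lock(&page_queues_lock);

        process_page(i);

        /* Drop the lock periodically to bound hold time, instead of
         * locking and unlocking around every single page. */
        if (++delayed_unlock > DELAYED_UNLOCK_LIMIT) {
            delayed_unlock = 0;
            pthread_mutex_unlock(&page_queues_lock);
        }
    }
    if (delayed_unlock)
        pthread_mutex_unlock(&page_queues_lock);
}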
*/ PAGE_WAKEUP(m); - vm_page_unlock_queues(); + if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) { + delayed_unlock = 0; + vm_page_unlock_queues(); + } } target_offset += PAGE_SIZE_64; xfer_size -= PAGE_SIZE; entry++; } + if (delayed_unlock) + vm_page_unlock_queues(); occupied = 1; + if (upl->flags & UPL_DEVICE_MEMORY) { occupied = 0; } else if (upl->flags & UPL_LITE) { @@ -3304,6 +4117,8 @@ upl_commit_range( vm_object_paging_end(shadow_object); } vm_object_unlock(shadow_object); + if (object != shadow_object) + vm_object_unlock(object); upl_unlock(upl); return KERN_SUCCESS; @@ -3312,19 +4127,19 @@ upl_commit_range( kern_return_t upl_abort_range( upl_t upl, - vm_offset_t offset, - vm_size_t size, + upl_offset_t offset, + upl_size_t size, int error, boolean_t *empty) { - vm_size_t xfer_size = size; + upl_size_t xfer_size = size; vm_object_t shadow_object; vm_object_t object = upl->map_object; vm_object_offset_t target_offset; - vm_object_offset_t page_offset; int entry; wpl_array_t lite_list; int occupied; + boolean_t shadow_internal; *empty = FALSE; @@ -3350,23 +4165,25 @@ upl_abort_range( upl_unlock(upl); return KERN_FAILURE; } - + if (object != shadow_object) + vm_object_lock(object); vm_object_lock(shadow_object); + shadow_internal = shadow_object->internal; + if(upl->flags & UPL_INTERNAL) { lite_list = (wpl_array_t) - ((((vm_offset_t)upl) + sizeof(struct upl)) + ((((uintptr_t)upl) + sizeof(struct upl)) + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t))); } else { lite_list = (wpl_array_t) - (((vm_offset_t)upl) + sizeof(struct upl)); + (((uintptr_t)upl) + sizeof(struct upl)); } entry = offset/PAGE_SIZE; target_offset = (vm_object_offset_t)offset; while(xfer_size) { vm_page_t t,m; - upl_page_info_t *p; m = VM_PAGE_NULL; if(upl->flags & UPL_LITE) { @@ -3397,6 +4214,8 @@ upl_abort_range( if(m != VM_PAGE_NULL) { vm_page_lock_queues(); if(m->absent) { + boolean_t must_free = TRUE; + /* COPYOUT = FALSE case */ /* check for error conditions which must */ /* be passed back to the pages customer */ @@ -3406,33 +4225,40 @@ upl_abort_range( vm_object_absent_release(m->object); m->page_error = KERN_MEMORY_ERROR; m->error = TRUE; + must_free = FALSE; } else if(error & UPL_ABORT_UNAVAILABLE) { m->restart = FALSE; m->unusual = TRUE; - m->clustered = FALSE; + must_free = FALSE; } else if(error & UPL_ABORT_ERROR) { m->restart = FALSE; m->absent = FALSE; vm_object_absent_release(m->object); m->page_error = KERN_MEMORY_ERROR; m->error = TRUE; - } else if(error & UPL_ABORT_DUMP_PAGES) { - m->clustered = TRUE; - } else { - m->clustered = TRUE; + must_free = FALSE; } - + + /* + * ENCRYPTED SWAP: + * If the page was already encrypted, + * we don't really need to decrypt it + * now. It will get decrypted later, + * on demand, as soon as someone needs + * to access its contents. + */ m->cleaning = FALSE; m->overwriting = FALSE; PAGE_WAKEUP_DONE(m); - if(m->clustered) { + + if (must_free == TRUE) { vm_page_free(m); } else { vm_page_activate(m); } - vm_page_unlock_queues(); + target_offset += PAGE_SIZE_64; xfer_size -= PAGE_SIZE; entry++; @@ -3442,14 +4268,7 @@ upl_abort_range( * Handle the trusted pager throttle. 
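The "occupied" check at the end of the commit path above decides whether the UPL still covers any pages: device-memory UPLs are treated as empty, lite UPLs are empty when every bitmap word is zero, and non-lite UPLs when the shadow pageout object has no resident pages left. The lite-list half of that test is just a word scan, shown here with the same illustrative wpl_word_t as before.

#include <stdint.h>
#include <stdbool.h>

typedef uint32_t wpl_word_t;

/* True if any page bit is still set in a lite list covering `pages` pages. */
bool lite_list_occupied(const wpl_word_t *lite_list, unsigned pages)
{
    unsigned words = (pages + 31) >> 5;     /* round up to whole 32-bit words */

    for (unsigned i = 0; i < words; i++)
        if (lite_list[i] != 0)
            return true;
    return false;
}

Because commit and abort clear each page's bit as they go, an all-zero bitmap means the UPL has been fully drained, which is what the surrounding code uses to finish up the paging references and report the UPL as empty.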
*/ if (m->laundry) { - vm_page_laundry_count--; - m->laundry = FALSE; - if (vm_page_laundry_count - < vm_page_laundry_min) { - vm_page_laundry_min = 0; - thread_wakeup((event_t) - &vm_page_laundry_count); - } + vm_pageout_throttle_up(m); } if(m->pageout) { assert(m->busy); @@ -3459,7 +4278,6 @@ upl_abort_range( } m->dump_cleaning = FALSE; m->cleaning = FALSE; - m->busy = FALSE; m->overwriting = FALSE; #if MACH_PAGEMAP vm_external_state_clr( @@ -3467,9 +4285,9 @@ upl_abort_range( #endif /* MACH_PAGEMAP */ if(error & UPL_ABORT_DUMP_PAGES) { vm_page_free(m); - pmap_page_protect(m->phys_page, VM_PROT_NONE); + pmap_disconnect(m->phys_page); } else { - PAGE_WAKEUP(m); + PAGE_WAKEUP_DONE(m); } vm_page_unlock_queues(); } @@ -3506,7 +4324,11 @@ upl_abort_range( vm_object_paging_end(shadow_object); } vm_object_unlock(shadow_object); + if (object != shadow_object) + vm_object_unlock(object); + upl_unlock(upl); + return KERN_SUCCESS; } @@ -3520,10 +4342,11 @@ upl_abort( vm_object_offset_t offset; vm_object_offset_t shadow_offset; vm_object_offset_t target_offset; - int i; + upl_size_t i; wpl_array_t lite_list; vm_page_t t,m; int occupied; + boolean_t shadow_internal; if (upl == UPL_NULL) return KERN_INVALID_ARGUMENT; @@ -3559,14 +4382,20 @@ upl_abort( if(upl->flags & UPL_INTERNAL) { lite_list = (wpl_array_t) - ((((vm_offset_t)upl) + sizeof(struct upl)) + ((((uintptr_t)upl) + sizeof(struct upl)) + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t))); } else { lite_list = (wpl_array_t) - (((vm_offset_t)upl) + sizeof(struct upl)); + (((uintptr_t)upl) + sizeof(struct upl)); } offset = 0; + + if (object != shadow_object) + vm_object_lock(object); vm_object_lock(shadow_object); + + shadow_internal = shadow_object->internal; + for(i = 0; i<(upl->size); i+=PAGE_SIZE, offset += PAGE_SIZE_64) { m = VM_PAGE_NULL; target_offset = offset + shadow_offset; @@ -3594,6 +4423,8 @@ upl_abort( if(m != VM_PAGE_NULL) { vm_page_lock_queues(); if(m->absent) { + boolean_t must_free = TRUE; + /* COPYOUT = FALSE case */ /* check for error conditions which must */ /* be passed back to the pages customer */ @@ -3603,26 +4434,34 @@ upl_abort( vm_object_absent_release(m->object); m->page_error = KERN_MEMORY_ERROR; m->error = TRUE; + must_free = FALSE; } else if(error & UPL_ABORT_UNAVAILABLE) { m->restart = FALSE; m->unusual = TRUE; - m->clustered = FALSE; + must_free = FALSE; } else if(error & UPL_ABORT_ERROR) { m->restart = FALSE; m->absent = FALSE; vm_object_absent_release(m->object); m->page_error = KERN_MEMORY_ERROR; m->error = TRUE; - } else if(error & UPL_ABORT_DUMP_PAGES) { - m->clustered = TRUE; - } else { - m->clustered = TRUE; + must_free = FALSE; } - + + /* + * ENCRYPTED SWAP: + * If the page was already encrypted, + * we don't really need to decrypt it + * now. It will get decrypted later, + * on demand, as soon as someone needs + * to access its contents. + */ + m->cleaning = FALSE; m->overwriting = FALSE; PAGE_WAKEUP_DONE(m); - if(m->clustered) { + + if (must_free == TRUE) { vm_page_free(m); } else { vm_page_activate(m); @@ -3634,14 +4473,7 @@ upl_abort( * Handle the trusted pager throttle. 
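The abort paths above replace the old m->clustered juggling with an explicit must_free flag for absent pages: a page that was never filled is freed on abort unless the error code asks for it to be kept resident to carry the unavailable or error condition back to a later fault. Restated as a predicate, covering only the error bits visible in these hunks (the diff context elides one more case that is given the same must_free = FALSE treatment); the flag values are illustrative.

#include <stdbool.h>

#define UPL_ABORT_UNAVAILABLE 0x2     /* illustrative values, not the kernel's */
#define UPL_ABORT_ERROR       0x4

/* For an absent page being aborted: should it simply be freed, or kept
 * resident (activated) so the condition can be reported to a later fault? */
bool abort_must_free_absent_page(int error)
{
    if (error & (UPL_ABORT_UNAVAILABLE | UPL_ABORT_ERROR))
        return false;     /* page is kept to carry the unusual/error state */
    return true;          /* nothing to report: free the never-filled page */
}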
*/ if (m->laundry) { - vm_page_laundry_count--; - m->laundry = FALSE; - if (vm_page_laundry_count - < vm_page_laundry_min) { - vm_page_laundry_min = 0; - thread_wakeup((event_t) - &vm_page_laundry_count); - } + vm_pageout_throttle_up(m); } if(m->pageout) { assert(m->busy); @@ -3651,7 +4483,6 @@ upl_abort( } m->dump_cleaning = FALSE; m->cleaning = FALSE; - m->busy = FALSE; m->overwriting = FALSE; #if MACH_PAGEMAP vm_external_state_clr( @@ -3659,9 +4490,9 @@ upl_abort( #endif /* MACH_PAGEMAP */ if(error & UPL_ABORT_DUMP_PAGES) { vm_page_free(m); - pmap_page_protect(m->phys_page, VM_PROT_NONE); + pmap_disconnect(m->phys_page); } else { - PAGE_WAKEUP(m); + PAGE_WAKEUP_DONE(m); } vm_page_unlock_queues(); } @@ -3671,12 +4502,12 @@ upl_abort( occupied = 0; } else if (upl->flags & UPL_LITE) { int pg_num; - int i; + int j; pg_num = upl->size/PAGE_SIZE; pg_num = (pg_num + 31) >> 5; occupied = 0; - for(i= 0; iflags & UPL_DEVICE_MEMORY) page_list = NULL; + if (upl->flags & UPL_ENCRYPTED) { + /* + * ENCRYPTED SWAP: + * This UPL was encrypted, but we don't need + * to decrypt here. We'll decrypt each page + * later, on demand, as soon as someone needs + * to access the page's contents. + */ + } + if ((upl->flags & UPL_CLEAR_DIRTY) || (upl->flags & UPL_PAGE_SYNC_DONE) || page_list) { vm_object_t shadow_object = upl->map_object->shadow; vm_object_t object = upl->map_object; vm_object_offset_t target_offset; - vm_size_t xfer_end; + upl_size_t xfer_end; int entry; vm_page_t t, m; upl_page_info_t *p; + if (object != shadow_object) + vm_object_lock(object); vm_object_lock(shadow_object); entry = 0; @@ -3748,6 +4594,15 @@ upl_commit( m = vm_page_lookup(shadow_object, target_offset); if(m != VM_PAGE_NULL) { + /* + * ENCRYPTED SWAP: + * If this page was encrypted, we + * don't need to decrypt it here. + * We'll decrypt it later, on demand, + * as soon as someone needs to access + * its contents. + */ + if (upl->flags & UPL_CLEAR_DIRTY) { pmap_clear_modify(m->phys_page); m->dirty = FALSE; @@ -3787,8 +4642,10 @@ upl_commit( target_offset += PAGE_SIZE_64; entry++; } - vm_object_unlock(shadow_object); + if (object != shadow_object) + vm_object_unlock(object); + } if (upl->flags & UPL_DEVICE_MEMORY) { vm_object_lock(upl->map_object->shadow); @@ -3806,7 +4663,7 @@ kern_return_t vm_object_iopl_request( vm_object_t object, vm_object_offset_t offset, - vm_size_t size, + upl_size_t size, upl_t *upl_ptr, upl_page_info_array_t user_page_list, unsigned int *page_list_count, @@ -3814,22 +4671,43 @@ vm_object_iopl_request( { vm_page_t dst_page; vm_object_offset_t dst_offset = offset; - vm_size_t xfer_size = size; + upl_size_t xfer_size = size; upl_t upl = NULL; - int entry; - wpl_array_t lite_list; + unsigned int entry; + wpl_array_t lite_list = NULL; int page_field_size; - + int delayed_unlock = 0; + int no_zero_fill = FALSE; vm_page_t alias_page = NULL; kern_return_t ret; vm_prot_t prot; - if(cntrl_flags & UPL_COPYOUT_FROM) { + if (cntrl_flags & ~UPL_VALID_FLAGS) { + /* + * For forward compatibility's sake, + * reject any unknown flag. + */ + return KERN_INVALID_VALUE; + } + + if (cntrl_flags & UPL_ENCRYPT) { + /* + * ENCRYPTED SWAP: + * The paging path doesn't use this interface, + * so we don't support the UPL_ENCRYPT flag + * here. We won't encrypt the pages. + */ + assert(! 
(cntrl_flags & UPL_ENCRYPT)); + } + + if (cntrl_flags & UPL_NOZEROFILL) + no_zero_fill = TRUE; + + if (cntrl_flags & UPL_COPYOUT_FROM) prot = VM_PROT_READ; - } else { + else prot = VM_PROT_READ | VM_PROT_WRITE; - } if(((size/page_size) > MAX_UPL_TRANSFER) && !object->phys_contiguous) { size = MAX_UPL_TRANSFER * page_size; @@ -3859,9 +4737,9 @@ vm_object_iopl_request( UPL_CREATE_INTERNAL | UPL_CREATE_LITE, size); user_page_list = (upl_page_info_t *) - (((vm_offset_t)upl) + sizeof(struct upl)); + (((uintptr_t)upl) + sizeof(struct upl)); lite_list = (wpl_array_t) - (((vm_offset_t)user_page_list) + + (((uintptr_t)user_page_list) + ((size/PAGE_SIZE) * sizeof(upl_page_info_t))); page_field_size = ((size/PAGE_SIZE) + 7) >> 3; @@ -3873,7 +4751,7 @@ vm_object_iopl_request( } else { upl = upl_create(UPL_CREATE_INTERNAL, size); user_page_list = (upl_page_info_t *) - (((vm_offset_t)upl) + (((uintptr_t)upl) + sizeof(struct upl)); upl->flags = UPL_INTERNAL | UPL_IO_WIRE; } @@ -3881,7 +4759,7 @@ vm_object_iopl_request( if(cntrl_flags & UPL_SET_LITE) { upl = upl_create(UPL_CREATE_LITE, size); lite_list = (wpl_array_t) - (((vm_offset_t)upl) + sizeof(struct upl)); + (((uintptr_t)upl) + sizeof(struct upl)); page_field_size = ((size/PAGE_SIZE) + 7) >> 3; page_field_size = (page_field_size + 3) & 0xFFFFFFFC; @@ -3894,14 +4772,6 @@ vm_object_iopl_request( } if(object->phys_contiguous) { - upl->size = size; - upl->offset = offset + object->paging_offset; - *upl_ptr = upl; - if(user_page_list) { - user_page_list[0].phys_addr = - (offset + object->shadow_offset)>>12; - user_page_list[0].device = TRUE; - } upl->map_object = object; /* don't need any shadow mappings for this one */ /* since it is already I/O memory */ @@ -3911,6 +4781,16 @@ vm_object_iopl_request( vm_object_paging_begin(object); vm_object_unlock(object); + /* paging in progress also protects the paging_offset */ + upl->offset = offset + object->paging_offset; + upl->size = size; + *upl_ptr = upl; + if(user_page_list) { + user_page_list[0].phys_addr = + (offset + object->shadow_offset)>>PAGE_SHIFT; + user_page_list[0].device = TRUE; + } + if(page_list_count != NULL) { if (upl->flags & UPL_INTERNAL) { *page_list_count = 0; @@ -3920,7 +4800,8 @@ vm_object_iopl_request( } return KERN_SUCCESS; } - + if(user_page_list) + user_page_list[0].device = FALSE; if(cntrl_flags & UPL_SET_LITE) { upl->map_object = object; @@ -3936,214 +4817,1153 @@ vm_object_iopl_request( upl->map_object->wimg_bits = object->wimg_bits; vm_object_unlock(upl->map_object); } + } + vm_object_lock(object); + vm_object_paging_begin(object); + + if (!object->phys_contiguous) { + /* Protect user space from future COW operations */ + object->true_share = TRUE; + if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) + object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; + } + + /* we can lock the upl offset now that paging_in_progress is set */ + if(upl_ptr) { upl->size = size; upl->offset = offset + object->paging_offset; *upl_ptr = upl; +#ifdef UPL_DEBUG + queue_enter(&object->uplq, upl, upl_t, uplq); +#endif /* UPL_DEBUG */ + } + + if (cntrl_flags & UPL_BLOCK_ACCESS) { + /* + * The user requested that access to the pages in this URL + * be blocked until the UPL is commited or aborted. 
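As a rough userspace model of the UPL_BLOCK_ACCESS behaviour described just above: pages in the UPL are marked "busy", and anyone who needs one sleeps until commit or abort clears the bit and issues a wakeup. The struct and function names below are illustrative only; pthread primitives stand in for the VM page wait/wakeup machinery.

#include <pthread.h>
#include <stdbool.h>

/* Toy model of the "busy page" protocol. */
struct toy_page {
	pthread_mutex_t lock;
	pthread_cond_t  wakeup;
	bool            busy;
};

static void
toy_page_init(struct toy_page *p)
{
	pthread_mutex_init(&p->lock, NULL);
	pthread_cond_init(&p->wakeup, NULL);
	p->busy = true;		/* starts blocked, e.g. wired in a UPL */
}

/* A would-be accessor sleeps while the page is busy. */
static void
toy_page_wait(struct toy_page *p)
{
	pthread_mutex_lock(&p->lock);
	while (p->busy)
		pthread_cond_wait(&p->wakeup, &p->lock);
	pthread_mutex_unlock(&p->lock);
}

/* The commit/abort path clears the busy bit and wakes all sleepers. */
static void
toy_page_wakeup_done(struct toy_page *p)
{
	pthread_mutex_lock(&p->lock);
	p->busy = false;
	pthread_cond_broadcast(&p->wakeup);
	pthread_mutex_unlock(&p->lock);
}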
+ */ + upl->flags |= UPL_ACCESS_BLOCKED; + } + + entry = 0; + while (xfer_size) { + if((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) { + if (delayed_unlock) { + delayed_unlock = 0; + vm_page_unlock_queues(); + } + vm_object_unlock(object); + VM_PAGE_GRAB_FICTITIOUS(alias_page); + vm_object_lock(object); + } + dst_page = vm_page_lookup(object, dst_offset); + + /* + * ENCRYPTED SWAP: + * If the page is encrypted, we need to decrypt it, + * so force a soft page fault. + */ + if ((dst_page == VM_PAGE_NULL) || (dst_page->busy) || + (dst_page->encrypted) || + (dst_page->unusual && (dst_page->error || + dst_page->restart || + dst_page->absent || + dst_page->fictitious || + (prot & dst_page->page_lock)))) { + vm_fault_return_t result; + do { + vm_page_t top_page; + kern_return_t error_code; + int interruptible; + + vm_object_offset_t lo_offset = offset; + vm_object_offset_t hi_offset = offset + size; + + + if (delayed_unlock) { + delayed_unlock = 0; + vm_page_unlock_queues(); + } + + if(cntrl_flags & UPL_SET_INTERRUPTIBLE) { + interruptible = THREAD_ABORTSAFE; + } else { + interruptible = THREAD_UNINT; + } + + result = vm_fault_page(object, dst_offset, + prot | VM_PROT_WRITE, FALSE, + interruptible, + lo_offset, hi_offset, + VM_BEHAVIOR_SEQUENTIAL, + &prot, &dst_page, &top_page, + (int *)0, + &error_code, no_zero_fill, FALSE, NULL, 0); + + switch(result) { + case VM_FAULT_SUCCESS: + + PAGE_WAKEUP_DONE(dst_page); + + /* + * Release paging references and + * top-level placeholder page, if any. + */ + + if(top_page != VM_PAGE_NULL) { + vm_object_t local_object; + local_object = + top_page->object; + if(top_page->object + != dst_page->object) { + vm_object_lock( + local_object); + VM_PAGE_FREE(top_page); + vm_object_paging_end( + local_object); + vm_object_unlock( + local_object); + } else { + VM_PAGE_FREE(top_page); + vm_object_paging_end( + local_object); + } + } + + break; + + + case VM_FAULT_RETRY: + vm_object_lock(object); + vm_object_paging_begin(object); + break; + + case VM_FAULT_FICTITIOUS_SHORTAGE: + vm_page_more_fictitious(); + vm_object_lock(object); + vm_object_paging_begin(object); + break; + + case VM_FAULT_MEMORY_SHORTAGE: + if (vm_page_wait(interruptible)) { + vm_object_lock(object); + vm_object_paging_begin(object); + break; + } + /* fall thru */ + + case VM_FAULT_INTERRUPTED: + error_code = MACH_SEND_INTERRUPTED; + case VM_FAULT_MEMORY_ERROR: + ret = (error_code ? error_code: + KERN_MEMORY_ERROR); + vm_object_lock(object); + for(; offset < dst_offset; + offset += PAGE_SIZE) { + dst_page = vm_page_lookup( + object, offset); + if(dst_page == VM_PAGE_NULL) + panic("vm_object_iopl_request: Wired pages missing. \n"); + vm_page_lock_queues(); + vm_page_unwire(dst_page); + vm_page_unlock_queues(); + VM_STAT(reactivations++); + } + vm_object_unlock(object); + upl_destroy(upl); + return ret; + } + } while ((result != VM_FAULT_SUCCESS) + || (result == VM_FAULT_INTERRUPTED)); + } + if (delayed_unlock == 0) + vm_page_lock_queues(); + vm_page_wire(dst_page); + + if (cntrl_flags & UPL_BLOCK_ACCESS) { + /* + * Mark the page "busy" to block any future page fault + * on this page. We'll also remove the mapping + * of all these pages before leaving this routine. + */ + assert(!dst_page->fictitious); + dst_page->busy = TRUE; + } + + if (upl_ptr) { + if (cntrl_flags & UPL_SET_LITE) { + int pg_num; + pg_num = (dst_offset-offset)/PAGE_SIZE; + lite_list[pg_num>>5] |= 1 << (pg_num & 31); + } else { + /* + * Convert the fictitious page to a + * private shadow of the real page. 
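The lite-list update above packs one bit per page into 32-bit words: pg_num >> 5 selects the word and pg_num & 31 the bit within it. A self-contained sketch of the same bitmap arithmetic, with helper names that are assumptions rather than kernel functions:

#include <stdint.h>
#include <stdbool.h>

/* Set the bit for page 'pg_num' in a UPL-style lite list. */
static inline void
lite_bit_set(uint32_t *lite_list, unsigned pg_num)
{
	lite_list[pg_num >> 5] |= 1u << (pg_num & 31);
}

/* Test whether page 'pg_num' is present in the lite list. */
static inline bool
lite_bit_test(const uint32_t *lite_list, unsigned pg_num)
{
	return (lite_list[pg_num >> 5] & (1u << (pg_num & 31))) != 0;
}

/* Clear the bit again, e.g. on commit or abort of that page. */
static inline void
lite_bit_clear(uint32_t *lite_list, unsigned pg_num)
{
	lite_list[pg_num >> 5] &= ~(1u << (pg_num & 31));
}

This matches the sizing done earlier in the diff, where page_field_size reserves ((size/PAGE_SIZE) + 7) >> 3 bytes rounded up to a multiple of four.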
+ */ + assert(alias_page->fictitious); + alias_page->fictitious = FALSE; + alias_page->private = TRUE; + alias_page->pageout = TRUE; + alias_page->phys_page = dst_page->phys_page; + vm_page_wire(alias_page); + + vm_page_insert(alias_page, + upl->map_object, size - xfer_size); + assert(!alias_page->wanted); + alias_page->busy = FALSE; + alias_page->absent = FALSE; + } + + /* expect the page to be used */ + dst_page->reference = TRUE; + + if (!(cntrl_flags & UPL_COPYOUT_FROM)) + dst_page->dirty = TRUE; + alias_page = NULL; + + if (user_page_list) { + user_page_list[entry].phys_addr + = dst_page->phys_page; + user_page_list[entry].dirty = + dst_page->dirty; + user_page_list[entry].pageout = + dst_page->pageout; + user_page_list[entry].absent = + dst_page->absent; + user_page_list[entry].precious = + dst_page->precious; + } + } + if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) { + delayed_unlock = 0; + vm_page_unlock_queues(); + } + entry++; + dst_offset += PAGE_SIZE_64; + xfer_size -= PAGE_SIZE; + } + if (delayed_unlock) + vm_page_unlock_queues(); + + if (upl->flags & UPL_INTERNAL) { + if(page_list_count != NULL) + *page_list_count = 0; + } else if (*page_list_count > entry) { + if(page_list_count != NULL) + *page_list_count = entry; + } + + if (alias_page != NULL) { + vm_page_lock_queues(); + vm_page_free(alias_page); + vm_page_unlock_queues(); + } + + vm_object_unlock(object); + + if (cntrl_flags & UPL_BLOCK_ACCESS) { + /* + * We've marked all the pages "busy" so that future + * page faults will block. + * Now remove the mapping for these pages, so that they + * can't be accessed without causing a page fault. + */ + vm_object_pmap_protect(object, offset, (vm_object_size_t)size, + PMAP_NULL, 0, VM_PROT_NONE); + } + + return KERN_SUCCESS; +} + +kern_return_t +upl_transpose( + upl_t upl1, + upl_t upl2) +{ + kern_return_t retval; + boolean_t upls_locked; + vm_object_t object1, object2; + + if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2) { + return KERN_INVALID_ARGUMENT; + } + + upls_locked = FALSE; + + /* + * Since we need to lock both UPLs at the same time, + * avoid deadlocks by always taking locks in the same order. + */ + if (upl1 < upl2) { + upl_lock(upl1); + upl_lock(upl2); + } else { + upl_lock(upl2); + upl_lock(upl1); + } + upls_locked = TRUE; /* the UPLs will need to be unlocked */ + + object1 = upl1->map_object; + object2 = upl2->map_object; + + if (upl1->offset != 0 || upl2->offset != 0 || + upl1->size != upl2->size) { + /* + * We deal only with full objects, not subsets. + * That's because we exchange the entire backing store info + * for the objects: pager, resident pages, etc... We can't do + * only part of it. + */ + retval = KERN_INVALID_VALUE; + goto done; + } + + /* + * Tranpose the VM objects' backing store. + */ + retval = vm_object_transpose(object1, object2, + (vm_object_size_t) upl1->size); + + if (retval == KERN_SUCCESS) { + /* + * Make each UPL point to the correct VM object, i.e. the + * object holding the pages that the UPL refers to... + */ + upl1->map_object = object2; + upl2->map_object = object1; + } + +done: + /* + * Cleanup. + */ + if (upls_locked) { + upl_unlock(upl1); + upl_unlock(upl2); + upls_locked = FALSE; + } + + return retval; +} + +/* + * ENCRYPTED SWAP: + * + * Rationale: the user might have some encrypted data on disk (via + * FileVault or any other mechanism). That data is then decrypted in + * memory, which is safe as long as the machine is secure. But that + * decrypted data in memory could be paged out to disk by the default + * pager. 
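A minimal sketch of the deadlock-avoidance idiom used by upl_transpose above: when two locks must be held at once, always acquire them in a single global order (here, by address), so that two threads working on the same pair can never take them in opposite orders. pthread mutexes stand in for upl_lock; the function names are assumptions.

#include <pthread.h>
#include <stdint.h>

/* Acquire two locks in address order to prevent lock-order inversion. */
static void
lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if ((uintptr_t)a < (uintptr_t)b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void
unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	/* release order does not affect correctness */
	pthread_mutex_unlock(a);
	pthread_mutex_unlock(b);
}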
The data would then be stored on disk in clear (not encrypted) + * and it could be accessed by anyone who gets physical access to the + * disk (if the laptop or the disk gets stolen for example). This weakens + * the security offered by FileVault. + * + * Solution: the default pager will optionally request that all the + * pages it gathers for pageout be encrypted, via the UPL interfaces, + * before it sends this UPL to disk via the vnode_pageout() path. + * + * Notes: + * + * To avoid disrupting the VM LRU algorithms, we want to keep the + * clean-in-place mechanisms, which allow us to send some extra pages to + * swap (clustering) without actually removing them from the user's + * address space. We don't want the user to unknowingly access encrypted + * data, so we have to actually remove the encrypted pages from the page + * table. When the user accesses the data, the hardware will fail to + * locate the virtual page in its page table and will trigger a page + * fault. We can then decrypt the page and enter it in the page table + * again. Whenever we allow the user to access the contents of a page, + * we have to make sure it's not encrypted. + * + * + */ +/* + * ENCRYPTED SWAP: + * Reserve of virtual addresses in the kernel address space. + * We need to map the physical pages in the kernel, so that we + * can call the encryption/decryption routines with a kernel + * virtual address. We keep this pool of pre-allocated kernel + * virtual addresses so that we don't have to scan the kernel's + * virtaul address space each time we need to encrypt or decrypt + * a physical page. + * It would be nice to be able to encrypt and decrypt in physical + * mode but that might not always be more efficient... + */ +decl_simple_lock_data(,vm_paging_lock) +#define VM_PAGING_NUM_PAGES 64 +vm_map_offset_t vm_paging_base_address = 0; +boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, }; +int vm_paging_max_index = 0; +unsigned long vm_paging_no_kernel_page = 0; +unsigned long vm_paging_objects_mapped = 0; +unsigned long vm_paging_pages_mapped = 0; +unsigned long vm_paging_objects_mapped_slow = 0; +unsigned long vm_paging_pages_mapped_slow = 0; + +/* + * ENCRYPTED SWAP: + * vm_paging_map_object: + * Maps part of a VM object's pages in the kernel + * virtual address space, using the pre-allocated + * kernel virtual addresses, if possible. + * Context: + * The VM object is locked. This lock will get + * dropped and re-acquired though. + */ +kern_return_t +vm_paging_map_object( + vm_map_offset_t *address, + vm_page_t page, + vm_object_t object, + vm_object_offset_t offset, + vm_map_size_t *size) +{ + kern_return_t kr; + vm_map_offset_t page_map_offset; + vm_map_size_t map_size; + vm_object_offset_t object_offset; +#ifdef __ppc__ + int i; + vm_map_entry_t map_entry; +#endif /* __ppc__ */ + + +#ifdef __ppc__ + if (page != VM_PAGE_NULL && *size == PAGE_SIZE) { + /* + * Optimization for the PowerPC. + * Use one of the pre-allocated kernel virtual addresses + * and just enter the VM page in the kernel address space + * at that virtual address. + */ + vm_object_unlock(object); + simple_lock(&vm_paging_lock); + + if (vm_paging_base_address == 0) { + /* + * Initialize our pool of pre-allocated kernel + * virtual addresses. 
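The pre-allocated kernel virtual address pool above (vm_paging_base_address plus a vm_paging_page_inuse[] array scanned under vm_paging_lock) is essentially a fixed-size slot allocator. A hedged userspace sketch of that allocator, where each slot index would correspond to one page-sized virtual address at base + slot * PAGE_SIZE; the names and pthread locking here are illustrative assumptions:

#include <pthread.h>
#include <stdbool.h>

#define POOL_SLOTS 64		/* mirrors VM_PAGING_NUM_PAGES */

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static bool            slot_inuse[POOL_SLOTS];

/*
 * Grab a free slot index, or -1 if the pool is exhausted and the caller
 * must fall back to the slow path (in the kernel: a regular mapping set
 * up through the kernel map).
 */
static int
pool_alloc(void)
{
	int i, slot = -1;

	pthread_mutex_lock(&pool_lock);
	for (i = 0; i < POOL_SLOTS; i++) {
		if (!slot_inuse[i]) {
			slot_inuse[i] = true;
			slot = i;
			break;
		}
	}
	pthread_mutex_unlock(&pool_lock);
	return slot;
}

static void
pool_free(int slot)
{
	pthread_mutex_lock(&pool_lock);
	slot_inuse[slot] = false;
	pthread_mutex_unlock(&pool_lock);
}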
+ */ + simple_unlock(&vm_paging_lock); + page_map_offset = 0; + kr = vm_map_find_space(kernel_map, + &page_map_offset, + VM_PAGING_NUM_PAGES * PAGE_SIZE, + 0, + &map_entry); + if (kr != KERN_SUCCESS) { + panic("vm_paging_map_object: " + "kernel_map full\n"); + } + map_entry->object.vm_object = kernel_object; + map_entry->offset = + page_map_offset - VM_MIN_KERNEL_ADDRESS; + vm_object_reference(kernel_object); + vm_map_unlock(kernel_map); + + simple_lock(&vm_paging_lock); + if (vm_paging_base_address != 0) { + /* someone raced us and won: undo */ + simple_unlock(&vm_paging_lock); + kr = vm_map_remove(kernel_map, + page_map_offset, + page_map_offset + + (VM_PAGING_NUM_PAGES + * PAGE_SIZE), + VM_MAP_NO_FLAGS); + assert(kr == KERN_SUCCESS); + simple_lock(&vm_paging_lock); + } else { + vm_paging_base_address = page_map_offset; + } + } + + /* + * Try and find an available kernel virtual address + * from our pre-allocated pool. + */ + page_map_offset = 0; + for (i = 0; i < VM_PAGING_NUM_PAGES; i++) { + if (vm_paging_page_inuse[i] == FALSE) { + page_map_offset = vm_paging_base_address + + (i * PAGE_SIZE); + break; + } + } + + if (page_map_offset != 0) { + /* + * We found a kernel virtual address; + * map the physical page to that virtual address. + */ + if (i > vm_paging_max_index) { + vm_paging_max_index = i; + } + vm_paging_page_inuse[i] = TRUE; + simple_unlock(&vm_paging_lock); + pmap_map_block(kernel_pmap, + page_map_offset, + page->phys_page, + PAGE_SIZE, + VM_PROT_DEFAULT, + ((int) page->object->wimg_bits & + VM_WIMG_MASK), + 0); + vm_paging_objects_mapped++; + vm_paging_pages_mapped++; + *address = page_map_offset; + vm_object_lock(object); + + /* all done and mapped, ready to use ! */ + return KERN_SUCCESS; + } + + /* + * We ran out of pre-allocated kernel virtual + * addresses. Just map the page in the kernel + * the slow and regular way. + */ + vm_paging_no_kernel_page++; + simple_unlock(&vm_paging_lock); + vm_object_lock(object); + } +#endif /* __ppc__ */ + + object_offset = vm_object_trunc_page(offset); + map_size = vm_map_round_page(*size); + + /* + * Try and map the required range of the object + * in the kernel_map + */ + + /* don't go beyond the object's end... */ + if (object_offset >= object->size) { + map_size = 0; + } else if (map_size > object->size - offset) { + map_size = object->size - offset; + } + + vm_object_reference_locked(object); /* for the map entry */ + vm_object_unlock(object); + + kr = vm_map_enter(kernel_map, + address, + map_size, + 0, + VM_FLAGS_ANYWHERE, + object, + object_offset, + FALSE, + VM_PROT_DEFAULT, + VM_PROT_ALL, + VM_INHERIT_NONE); + if (kr != KERN_SUCCESS) { + *address = 0; + *size = 0; + vm_object_deallocate(object); /* for the map entry */ + return kr; + } + + *size = map_size; + + /* + * Enter the mapped pages in the page table now. 
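The slow path above truncates the object offset down to a page boundary and rounds the mapping size up to one (vm_object_trunc_page / vm_map_round_page). The underlying arithmetic, shown as a self-contained sketch with an assumed 4 KB page size and illustrative names:

#include <stdint.h>
#include <assert.h>

#define TOY_PAGE_SIZE 4096ULL		/* assumed; real code uses PAGE_SIZE */
#define TOY_PAGE_MASK (TOY_PAGE_SIZE - 1)

/* Round an offset down to the start of its page. */
static inline uint64_t
toy_trunc_page(uint64_t off)
{
	return off & ~TOY_PAGE_MASK;
}

/* Round an offset or size up to the next page boundary. */
static inline uint64_t
toy_round_page(uint64_t off)
{
	return (off + TOY_PAGE_MASK) & ~TOY_PAGE_MASK;
}

int
main(void)
{
	assert(toy_trunc_page(0x1234) == 0x1000);
	assert(toy_round_page(0x1234) == 0x2000);
	assert(toy_round_page(0x2000) == 0x2000);	/* already aligned */
	return 0;
}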
+ */ + vm_object_lock(object); + for (page_map_offset = 0; + map_size != 0; + map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) { + unsigned int cache_attr; + + page = vm_page_lookup(object, offset + page_map_offset); + if (page == VM_PAGE_NULL) { + panic("vm_paging_map_object: no page !?"); + } + if (page->no_isync == TRUE) { + pmap_sync_page_data_phys(page->phys_page); + } + cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK; + + PMAP_ENTER(kernel_pmap, + *address + page_map_offset, + page, + VM_PROT_DEFAULT, + cache_attr, + FALSE); + } + + vm_paging_objects_mapped_slow++; + vm_paging_pages_mapped_slow += map_size / PAGE_SIZE_64; + + return KERN_SUCCESS; +} + +/* + * ENCRYPTED SWAP: + * vm_paging_unmap_object: + * Unmaps part of a VM object's pages from the kernel + * virtual address space. + * Context: + * The VM object is locked. This lock will get + * dropped and re-acquired though. + */ +void +vm_paging_unmap_object( + vm_object_t object, + vm_map_offset_t start, + vm_map_offset_t end) +{ + kern_return_t kr; +#ifdef __ppc__ + int i; +#endif /* __ppc__ */ + + if ((vm_paging_base_address != 0) && + ((start < vm_paging_base_address) || + (end > (vm_paging_base_address + + (VM_PAGING_NUM_PAGES * PAGE_SIZE))))) { + /* + * We didn't use our pre-allocated pool of + * kernel virtual address. Deallocate the + * virtual memory. + */ + if (object != VM_OBJECT_NULL) { + vm_object_unlock(object); + } + kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS); + if (object != VM_OBJECT_NULL) { + vm_object_lock(object); + } + assert(kr == KERN_SUCCESS); + } else { + /* + * We used a kernel virtual address from our + * pre-allocated pool. Put it back in the pool + * for next time. + */ +#ifdef __ppc__ + assert(end - start == PAGE_SIZE); + i = (start - vm_paging_base_address) >> PAGE_SHIFT; + + /* undo the pmap mapping */ + mapping_remove(kernel_pmap, start); + + simple_lock(&vm_paging_lock); + vm_paging_page_inuse[i] = FALSE; + simple_unlock(&vm_paging_lock); +#endif /* __ppc__ */ + } +} + +/* + * Encryption data. + * "iv" is the "initial vector". Ideally, we want to + * have a different one for each page we encrypt, so that + * crackers can't find encryption patterns too easily. + */ +#define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */ +boolean_t swap_crypt_ctx_initialized = FALSE; +aes_32t swap_crypt_key[8]; /* big enough for a 256 key */ +aes_ctx swap_crypt_ctx; +const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, }; + +#if DEBUG +boolean_t swap_crypt_ctx_tested = FALSE; +unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096))); +unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096))); +unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096))); +#endif /* DEBUG */ + +extern u_long random(void); + +/* + * Initialize the encryption context: key and key size. + */ +void swap_crypt_ctx_initialize(void); /* forward */ +void +swap_crypt_ctx_initialize(void) +{ + unsigned int i; + + /* + * No need for locking to protect swap_crypt_ctx_initialized + * because the first use of encryption will come from the + * pageout thread (we won't pagein before there's been a pageout) + * and there's only one pageout thread. 
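The key setup that follows relies on the single-caller guarantee described in the comment above to skip locking. A minimal sketch of that flag-guarded, one-shot initialization; random() here is only a placeholder entropy source for illustration, and a general userspace program would use pthread_once() and a proper random source instead:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

#define KEY_WORDS 8		/* room for a 256-bit key, as in swap_crypt_key[8] */

static bool	swap_key_initialized = false;
static uint32_t	swap_key[KEY_WORDS];

/*
 * One-shot initialization without a lock.  Only safe when the first
 * caller is guaranteed to be a single thread, as the kernel comment
 * above explains for the pageout thread.
 */
static void
swap_key_initialize(void)
{
	int i;

	if (swap_key_initialized)
		return;
	for (i = 0; i < KEY_WORDS; i++)
		swap_key[i] = (uint32_t)random();	/* placeholder, not real key material */
	swap_key_initialized = true;
}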
+ */ + if (swap_crypt_ctx_initialized == FALSE) { + for (i = 0; + i < (sizeof (swap_crypt_key) / + sizeof (swap_crypt_key[0])); + i++) { + swap_crypt_key[i] = random(); + } + aes_encrypt_key((const unsigned char *) swap_crypt_key, + SWAP_CRYPT_AES_KEY_SIZE, + &swap_crypt_ctx.encrypt); + aes_decrypt_key((const unsigned char *) swap_crypt_key, + SWAP_CRYPT_AES_KEY_SIZE, + &swap_crypt_ctx.decrypt); + swap_crypt_ctx_initialized = TRUE; + } + +#if DEBUG + /* + * Validate the encryption algorithms. + */ + if (swap_crypt_ctx_tested == FALSE) { + /* initialize */ + for (i = 0; i < 4096; i++) { + swap_crypt_test_page_ref[i] = (char) i; + } + /* encrypt */ + aes_encrypt_cbc(swap_crypt_test_page_ref, + swap_crypt_null_iv, + PAGE_SIZE / AES_BLOCK_SIZE, + swap_crypt_test_page_encrypt, + &swap_crypt_ctx.encrypt); + /* decrypt */ + aes_decrypt_cbc(swap_crypt_test_page_encrypt, + swap_crypt_null_iv, + PAGE_SIZE / AES_BLOCK_SIZE, + swap_crypt_test_page_decrypt, + &swap_crypt_ctx.decrypt); + /* compare result with original */ + for (i = 0; i < 4096; i ++) { + if (swap_crypt_test_page_decrypt[i] != + swap_crypt_test_page_ref[i]) { + panic("encryption test failed"); + } + } + + /* encrypt again */ + aes_encrypt_cbc(swap_crypt_test_page_decrypt, + swap_crypt_null_iv, + PAGE_SIZE / AES_BLOCK_SIZE, + swap_crypt_test_page_decrypt, + &swap_crypt_ctx.encrypt); + /* decrypt in place */ + aes_decrypt_cbc(swap_crypt_test_page_decrypt, + swap_crypt_null_iv, + PAGE_SIZE / AES_BLOCK_SIZE, + swap_crypt_test_page_decrypt, + &swap_crypt_ctx.decrypt); + for (i = 0; i < 4096; i ++) { + if (swap_crypt_test_page_decrypt[i] != + swap_crypt_test_page_ref[i]) { + panic("in place encryption test failed"); + } + } + + swap_crypt_ctx_tested = TRUE; + } +#endif /* DEBUG */ +} + +/* + * ENCRYPTED SWAP: + * vm_page_encrypt: + * Encrypt the given page, for secure paging. + * The page might already be mapped at kernel virtual + * address "kernel_mapping_offset". Otherwise, we need + * to map it. + * + * Context: + * The page's object is locked, but this lock will be released + * and re-acquired. + * The page is busy and not accessible by users (not entered in any pmap). + */ +void +vm_page_encrypt( + vm_page_t page, + vm_map_offset_t kernel_mapping_offset) +{ + int clear_refmod = 0; + kern_return_t kr; + boolean_t page_was_referenced; + boolean_t page_was_modified; + vm_map_size_t kernel_mapping_size; + vm_offset_t kernel_vaddr; + union { + unsigned char aes_iv[AES_BLOCK_SIZE]; + struct { + memory_object_t pager_object; + vm_object_offset_t paging_offset; + } vm; + } encrypt_iv; + + if (! vm_pages_encrypted) { + vm_pages_encrypted = TRUE; + } + + assert(page->busy); + assert(page->dirty || page->precious); + + if (page->encrypted) { + /* + * Already encrypted: no need to do it again. + */ + vm_page_encrypt_already_encrypted_counter++; + return; + } + ASSERT_PAGE_DECRYPTED(page); + + /* + * Gather the "reference" and "modified" status of the page. + * We'll restore these values after the encryption, so that + * the encryption is transparent to the rest of the system + * and doesn't impact the VM's LRU logic. + */ + page_was_referenced = + (page->reference || pmap_is_referenced(page->phys_page)); + page_was_modified = + (page->dirty || pmap_is_modified(page->phys_page)); + + if (kernel_mapping_offset == 0) { + /* + * The page hasn't already been mapped in kernel space + * by the caller. Map it now, so that we can access + * its contents and encrypt them. 
+ */ + kernel_mapping_size = PAGE_SIZE; + kr = vm_paging_map_object(&kernel_mapping_offset, + page, + page->object, + page->offset, + &kernel_mapping_size); + if (kr != KERN_SUCCESS) { + panic("vm_page_encrypt: " + "could not map page in kernel: 0x%x\n", + kr); + } + } else { + kernel_mapping_size = 0; + } + kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset); + + if (swap_crypt_ctx_initialized == FALSE) { + swap_crypt_ctx_initialize(); + } + assert(swap_crypt_ctx_initialized); + + /* + * Prepare an "initial vector" for the encryption. + * We use the "pager" and the "paging_offset" for that + * page to obfuscate the encrypted data a bit more and + * prevent crackers from finding patterns that they could + * use to break the key. + */ + bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv)); + encrypt_iv.vm.pager_object = page->object->pager; + encrypt_iv.vm.paging_offset = + page->object->paging_offset + page->offset; + + vm_object_unlock(page->object); + + /* encrypt the "initial vector" */ + aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0], + swap_crypt_null_iv, + 1, + &encrypt_iv.aes_iv[0], + &swap_crypt_ctx.encrypt); + + /* + * Encrypt the page. + */ + aes_encrypt_cbc((const unsigned char *) kernel_vaddr, + &encrypt_iv.aes_iv[0], + PAGE_SIZE / AES_BLOCK_SIZE, + (unsigned char *) kernel_vaddr, + &swap_crypt_ctx.encrypt); + + vm_page_encrypt_counter++; + + vm_object_lock(page->object); + + /* + * Unmap the page from the kernel's address space, + * if we had to map it ourselves. Otherwise, let + * the caller undo the mapping if needed. + */ + if (kernel_mapping_size != 0) { + vm_paging_unmap_object(page->object, + kernel_mapping_offset, + kernel_mapping_offset + kernel_mapping_size); } - vm_object_lock(object); - if (!object->phys_contiguous) { - /* Protect user space from future COW operations */ - object->true_share = TRUE; - if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) - object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; + /* + * Restore the "reference" and "modified" bits. + * This should clean up any impact the encryption had + * on them. + */ + if (! page_was_referenced) { + clear_refmod |= VM_MEM_REFERENCED; + page->reference = FALSE; + } + if (! page_was_modified) { + clear_refmod |= VM_MEM_MODIFIED; + page->dirty = FALSE; } + if (clear_refmod) + pmap_clear_refmod(page->phys_page, clear_refmod); -#ifdef UBC_DEBUG - if(upl_ptr) - queue_enter(&object->uplq, upl, upl_t, uplq); -#endif /* UBC_DEBUG */ - vm_object_paging_begin(object); - entry = 0; - while (xfer_size) { - if((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) { - vm_object_unlock(object); - VM_PAGE_GRAB_FICTITIOUS(alias_page); - vm_object_lock(object); + page->encrypted = TRUE; +} + +/* + * ENCRYPTED SWAP: + * vm_page_decrypt: + * Decrypt the given page. + * The page might already be mapped at kernel virtual + * address "kernel_mapping_offset". Otherwise, we need + * to map it. + * + * Context: + * The page's VM object is locked but will be unlocked and relocked. + * The page is busy and not accessible by users (not entered in any pmap). 
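A userspace sketch of the per-page IV scheme used by vm_page_encrypt above: the IV is seeded from the page's pager identity and paging offset, whitened by one pass through the cipher with a fixed IV, and then used to CBC-encrypt the page. OpenSSL's EVP interface stands in for the kernel's aes_encrypt_cbc(); cbc_encrypt, encrypt_page, TOY_PAGE_SIZE and the 128-bit key are illustrative assumptions, not kernel or library APIs beyond the EVP calls shown.

#include <string.h>
#include <stdint.h>
#include <openssl/evp.h>

#define TOY_PAGE_SIZE	4096
#define AES_BLOCK	16

/* CBC-encrypt 'len' bytes (a multiple of 16) with a 128-bit key, no padding. */
static int
cbc_encrypt(const uint8_t key[16], const uint8_t iv[16],
	    const uint8_t *in, uint8_t *out, int len)
{
	EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
	int outl = 0, tmpl = 0, ok;

	if (ctx == NULL)
		return -1;
	ok = EVP_EncryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, key, iv) == 1 &&
	    EVP_CIPHER_CTX_set_padding(ctx, 0) == 1 &&
	    EVP_EncryptUpdate(ctx, out, &outl, in, len) == 1 &&
	    EVP_EncryptFinal_ex(ctx, out + outl, &tmpl) == 1;
	EVP_CIPHER_CTX_free(ctx);
	return ok ? 0 : -1;
}

/*
 * Derive a per-page IV from the page's backing object and paging offset,
 * whiten it with one cipher pass under a fixed IV, then encrypt the page
 * with the derived IV, mirroring the encrypt_iv scheme above.
 */
static int
encrypt_page(const uint8_t key[16], const void *pager_id,
	     uint64_t paging_offset,
	     const uint8_t in_page[TOY_PAGE_SIZE],
	     uint8_t out_page[TOY_PAGE_SIZE])
{
	uint8_t iv_seed[AES_BLOCK], derived_iv[AES_BLOCK];
	static const uint8_t fixed_iv[AES_BLOCK] = { 0xa, };	/* like swap_crypt_null_iv */

	/* seed the IV with values unique to this page */
	memset(iv_seed, 0, sizeof iv_seed);
	memcpy(iv_seed, &pager_id, sizeof pager_id);
	memcpy(iv_seed + sizeof pager_id, &paging_offset, sizeof paging_offset);

	/* encrypt the seed so pager pointers and offsets never appear in clear */
	if (cbc_encrypt(key, fixed_iv, iv_seed, derived_iv, AES_BLOCK) != 0)
		return -1;

	/* encrypt the page itself with the page-specific IV */
	return cbc_encrypt(key, derived_iv, in_page, out_page, TOY_PAGE_SIZE);
}

Decryption is symmetric: the same IV can be re-derived from the pager and offset, which is why vm_page_decrypt below rebuilds decrypt_iv the same way before calling the CBC decrypt routine.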
+ */ +void +vm_page_decrypt( + vm_page_t page, + vm_map_offset_t kernel_mapping_offset) +{ + int clear_refmod = 0; + kern_return_t kr; + vm_map_size_t kernel_mapping_size; + vm_offset_t kernel_vaddr; + boolean_t page_was_referenced; + union { + unsigned char aes_iv[AES_BLOCK_SIZE]; + struct { + memory_object_t pager_object; + vm_object_offset_t paging_offset; + } vm; + } decrypt_iv; + + assert(page->busy); + assert(page->encrypted); + + /* + * Gather the "reference" status of the page. + * We'll restore its value after the decryption, so that + * the decryption is transparent to the rest of the system + * and doesn't impact the VM's LRU logic. + */ + page_was_referenced = + (page->reference || pmap_is_referenced(page->phys_page)); + + if (kernel_mapping_offset == 0) { + /* + * The page hasn't already been mapped in kernel space + * by the caller. Map it now, so that we can access + * its contents and decrypt them. + */ + kernel_mapping_size = PAGE_SIZE; + kr = vm_paging_map_object(&kernel_mapping_offset, + page, + page->object, + page->offset, + &kernel_mapping_size); + if (kr != KERN_SUCCESS) { + panic("vm_page_decrypt: " + "could not map page in kernel: 0x%x\n"); } - dst_page = vm_page_lookup(object, dst_offset); - if ((dst_page == VM_PAGE_NULL) || (dst_page->busy) || - (dst_page->unusual && (dst_page->error || - dst_page->restart || dst_page->absent || - dst_page->fictitious || - prot & dst_page->page_lock))) { - vm_fault_return_t result; - do { - vm_page_t top_page; - kern_return_t error_code; - int interruptible; + } else { + kernel_mapping_size = 0; + } + kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset); - vm_object_offset_t lo_offset = offset; - vm_object_offset_t hi_offset = offset + size; + assert(swap_crypt_ctx_initialized); + /* + * Prepare an "initial vector" for the decryption. + * It has to be the same as the "initial vector" we + * used to encrypt that page. + */ + bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv)); + decrypt_iv.vm.pager_object = page->object->pager; + decrypt_iv.vm.paging_offset = + page->object->paging_offset + page->offset; + vm_object_unlock(page->object); - if(cntrl_flags & UPL_SET_INTERRUPTIBLE) { - interruptible = THREAD_ABORTSAFE; - } else { - interruptible = THREAD_UNINT; - } + /* encrypt the "initial vector" */ + aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0], + swap_crypt_null_iv, + 1, + &decrypt_iv.aes_iv[0], + &swap_crypt_ctx.encrypt); - result = vm_fault_page(object, dst_offset, - prot | VM_PROT_WRITE, FALSE, - interruptible, - lo_offset, hi_offset, - VM_BEHAVIOR_SEQUENTIAL, - &prot, &dst_page, &top_page, - (int *)0, - &error_code, FALSE, FALSE, NULL, 0); + /* + * Decrypt the page. + */ + aes_decrypt_cbc((const unsigned char *) kernel_vaddr, + &decrypt_iv.aes_iv[0], + PAGE_SIZE / AES_BLOCK_SIZE, + (unsigned char *) kernel_vaddr, + &swap_crypt_ctx.decrypt); + vm_page_decrypt_counter++; - switch(result) { - case VM_FAULT_SUCCESS: + vm_object_lock(page->object); - PAGE_WAKEUP_DONE(dst_page); + /* + * Unmap the page from the kernel's address space, + * if we had to map it ourselves. Otherwise, let + * the caller undo the mapping if needed. + */ + if (kernel_mapping_size != 0) { + vm_paging_unmap_object(page->object, + kernel_vaddr, + kernel_vaddr + PAGE_SIZE); + } - /* - * Release paging references and - * top-level placeholder page, if any. - */ + /* + * After decryption, the page is actually clean. + * It was encrypted as part of paging, which "cleans" + * the "dirty" pages. 
+ * Noone could access it after it was encrypted + * and the decryption doesn't count. + */ + page->dirty = FALSE; + clear_refmod = VM_MEM_MODIFIED; - if(top_page != VM_PAGE_NULL) { - vm_object_t local_object; - local_object = - top_page->object; - if(top_page->object - != dst_page->object) { - vm_object_lock( - local_object); - VM_PAGE_FREE(top_page); - vm_object_paging_end( - local_object); - vm_object_unlock( - local_object); - } else { - VM_PAGE_FREE(top_page); - vm_object_paging_end( - local_object); - } - } + /* restore the "reference" bit */ + if (! page_was_referenced) { + page->reference = FALSE; + clear_refmod |= VM_MEM_REFERENCED; + } + pmap_clear_refmod(page->phys_page, clear_refmod); - break; - - - case VM_FAULT_RETRY: - vm_object_lock(object); - vm_object_paging_begin(object); - break; + page->encrypted = FALSE; - case VM_FAULT_FICTITIOUS_SHORTAGE: - vm_page_more_fictitious(); - vm_object_lock(object); - vm_object_paging_begin(object); - break; + /* + * We've just modified the page's contents via the data cache and part + * of the new contents might still be in the cache and not yet in RAM. + * Since the page is now available and might get gathered in a UPL to + * be part of a DMA transfer from a driver that expects the memory to + * be coherent at this point, we have to flush the data cache. + */ + pmap_sync_page_data_phys(page->phys_page); + /* + * Since the page is not mapped yet, some code might assume that it + * doesn't need to invalidate the instruction cache when writing to + * that page. That code relies on "no_isync" being set, so that the + * caches get syncrhonized when the page is first mapped. So we need + * to set "no_isync" here too, despite the fact that we just + * synchronized the caches above... + */ + page->no_isync = TRUE; +} - case VM_FAULT_MEMORY_SHORTAGE: - if (vm_page_wait(interruptible)) { - vm_object_lock(object); - vm_object_paging_begin(object); - break; - } - /* fall thru */ +unsigned long upl_encrypt_upls = 0; +unsigned long upl_encrypt_pages = 0; - case VM_FAULT_INTERRUPTED: - error_code = MACH_SEND_INTERRUPTED; - case VM_FAULT_MEMORY_ERROR: - ret = (error_code ? error_code: - KERN_MEMORY_ERROR); - vm_object_lock(object); - for(; offset < dst_offset; - offset += PAGE_SIZE) { - dst_page = vm_page_lookup( - object, offset); - if(dst_page == VM_PAGE_NULL) - panic("vm_object_iopl_request: Wired pages missing. \n"); - vm_page_lock_queues(); - vm_page_unwire(dst_page); - vm_page_unlock_queues(); - VM_STAT(reactivations++); - } - vm_object_unlock(object); - upl_destroy(upl); - return ret; - } - } while ((result != VM_FAULT_SUCCESS) - || (result == VM_FAULT_INTERRUPTED)); - } +/* + * ENCRYPTED SWAP: + * + * upl_encrypt: + * Encrypts all the pages in the UPL, within the specified range. + * + */ +void +upl_encrypt( + upl_t upl, + upl_offset_t crypt_offset, + upl_size_t crypt_size) +{ + upl_size_t upl_size; + upl_offset_t upl_offset; + vm_object_t upl_object; + vm_page_t page; + vm_object_t shadow_object; + vm_object_offset_t shadow_offset; + vm_object_offset_t paging_offset; + vm_object_offset_t base_offset; - vm_page_lock_queues(); - vm_page_wire(dst_page); - vm_page_unlock_queues(); + upl_encrypt_upls++; + upl_encrypt_pages += crypt_size / PAGE_SIZE; - if(upl_ptr) { + upl_lock(upl); - vm_page_lock_queues(); - if(cntrl_flags & UPL_SET_LITE) { - int pg_num; - pg_num = (dst_offset-offset)/PAGE_SIZE; - lite_list[pg_num>>5] |= 1 << (pg_num & 31); - } else { - /* - * Convert the fictitious page to a - * private shadow of the real page. 
- */ - assert(alias_page->fictitious); - alias_page->fictitious = FALSE; - alias_page->private = TRUE; - alias_page->pageout = TRUE; - alias_page->phys_page = dst_page->phys_page; - vm_page_wire(alias_page); + upl_object = upl->map_object; + upl_offset = upl->offset; + upl_size = upl->size; - vm_page_insert(alias_page, - upl->map_object, size - xfer_size); - assert(!alias_page->wanted); - alias_page->busy = FALSE; - alias_page->absent = FALSE; - } + upl_unlock(upl); - /* expect the page to be used */ - dst_page->reference = TRUE; - if (!(cntrl_flags & UPL_COPYOUT_FROM)) - dst_page->dirty = TRUE; - alias_page = NULL; + vm_object_lock(upl_object); - if(user_page_list) { - user_page_list[entry].phys_addr - = dst_page->phys_page; - user_page_list[entry].dirty = - dst_page->dirty; - user_page_list[entry].pageout = - dst_page->pageout; - user_page_list[entry].absent = - dst_page->absent; - user_page_list[entry].precious = - dst_page->precious; - } - vm_page_unlock_queues(); - } - entry++; - dst_offset += PAGE_SIZE_64; - xfer_size -= PAGE_SIZE; + /* + * Find the VM object that contains the actual pages. + */ + if (upl_object->pageout) { + shadow_object = upl_object->shadow; + /* + * The offset in the shadow object is actually also + * accounted for in upl->offset. It possibly shouldn't be + * this way, but for now don't account for it twice. + */ + shadow_offset = 0; + assert(upl_object->paging_offset == 0); /* XXX ? */ + vm_object_lock(shadow_object); + } else { + shadow_object = upl_object; + shadow_offset = 0; } - if (upl->flags & UPL_INTERNAL) { - if(page_list_count != NULL) - *page_list_count = 0; - } else if (*page_list_count > entry) { - if(page_list_count != NULL) - *page_list_count = entry; + paging_offset = shadow_object->paging_offset; + vm_object_paging_begin(shadow_object); + + if (shadow_object != upl_object) { + vm_object_unlock(shadow_object); } + vm_object_unlock(upl_object); - if(alias_page != NULL) { - vm_page_lock_queues(); - vm_page_free(alias_page); - vm_page_unlock_queues(); + base_offset = shadow_offset; + base_offset += upl_offset; + base_offset += crypt_offset; + base_offset -= paging_offset; + /* + * Unmap the pages, so that nobody can continue accessing them while + * they're encrypted. After that point, all accesses to these pages + * will cause a page fault and block while the page is being encrypted + * (busy). After the encryption completes, any access will cause a + * page fault and the page gets decrypted at that time. + */ + assert(crypt_offset + crypt_size <= upl_size); + vm_object_pmap_protect(shadow_object, + base_offset, + (vm_object_size_t)crypt_size, + PMAP_NULL, + 0, + VM_PROT_NONE); + + /* XXX FBDP could the object have changed significantly here ? 
*/ + vm_object_lock(shadow_object); + + for (upl_offset = 0; + upl_offset < crypt_size; + upl_offset += PAGE_SIZE) { + page = vm_page_lookup(shadow_object, + base_offset + upl_offset); + if (page == VM_PAGE_NULL) { + panic("upl_encrypt: " + "no page for (obj=%p,off=%lld+%d)!\n", + shadow_object, + base_offset, + upl_offset); + } + vm_page_encrypt(page, 0); } - vm_object_unlock(object); - return KERN_SUCCESS; + vm_object_paging_end(shadow_object); + vm_object_unlock(shadow_object); } + vm_size_t -upl_get_internal_pagelist_offset() +upl_get_internal_pagelist_offset(void) { return sizeof(struct upl); } @@ -4177,9 +5997,9 @@ boolean_t upl_valid_page(upl_page_info_t *upl, int index) { return(UPL_VALID_PAGE(upl, index)); } -vm_offset_t upl_phys_page(upl_page_info_t *upl, int index) +ppnum_t upl_phys_page(upl_page_info_t *upl, int index) { - return((vm_offset_t)UPL_PHYS_PAGE(upl, index)); + return(UPL_PHYS_PAGE(upl, index)); } void @@ -4204,6 +6024,7 @@ vm_countdirtypages(void) if(m->pageout) pgopages++; if(m->precious) precpages++; + assert(m->object != kernel_object); m = (vm_page_t) queue_next(&m->pageq); if (m ==(vm_page_t )0) break; @@ -4219,6 +6040,7 @@ vm_countdirtypages(void) if(m->pageout) pgopages++; if(m->precious) precpages++; + assert(m->object != kernel_object); m = (vm_page_t) queue_next(&m->pageq); if (m ==(vm_page_t )0) break; @@ -4240,6 +6062,7 @@ vm_countdirtypages(void) if(m->pageout) pgopages++; if(m->precious) precpages++; + assert(m->object != kernel_object); m = (vm_page_t) queue_next(&m->pageq); if(m == (vm_page_t )0) break; @@ -4251,7 +6074,7 @@ vm_countdirtypages(void) } #endif /* MACH_BSD */ -#ifdef UBC_DEBUG +#ifdef UPL_DEBUG kern_return_t upl_ubc_alias_set(upl_t upl, unsigned int alias1, unsigned int alias2) { upl->ubc_alias1 = alias1; @@ -4266,7 +6089,7 @@ int upl_ubc_alias_get(upl_t upl, unsigned int * al, unsigned int * al2) *al2 = upl->ubc_alias2; return KERN_SUCCESS; } -#endif /* UBC_DEBUG */ +#endif /* UPL_DEBUG */ @@ -4276,13 +6099,11 @@ int upl_ubc_alias_get(upl_t upl, unsigned int * al, unsigned int * al2) #include #define printf kdbprintf -extern int db_indent; void db_pageout(void); void db_vm(void) { - extern int vm_page_gobble_count; iprintf("VM Statistics:\n"); db_indent += 2; @@ -4293,8 +6114,6 @@ db_vm(void) vm_page_free_count); printf(" wire %5d gobbl %5d\n", vm_page_wire_count, vm_page_gobble_count); - iprintf("laund %5d\n", - vm_page_laundry_count); db_indent -= 2; iprintf("target:\n"); db_indent += 2; @@ -4303,32 +6122,18 @@ db_vm(void) vm_page_free_target); printf(" resrv %5d\n", vm_page_free_reserved); db_indent -= 2; - - iprintf("burst:\n"); - db_indent += 2; - iprintf("max %5d min %5d wait %5d empty %5d\n", - vm_pageout_burst_max, vm_pageout_burst_min, - vm_pageout_burst_wait, vm_pageout_empty_wait); - db_indent -= 2; iprintf("pause:\n"); - db_indent += 2; - iprintf("count %5d max %5d\n", - vm_pageout_pause_count, vm_pageout_pause_max); -#if MACH_COUNTERS - iprintf("scan_continue called %8d\n", c_vm_pageout_scan_continue); -#endif /* MACH_COUNTERS */ - db_indent -= 2; db_pageout(); db_indent -= 2; } -void -db_pageout(void) -{ #if MACH_COUNTERS - extern int c_laundry_pages_freed; +extern int c_laundry_pages_freed; #endif /* MACH_COUNTERS */ +void +db_pageout(void) +{ iprintf("Pageout Statistics:\n"); db_indent += 2; iprintf("active %5d inactv %5d\n", @@ -4361,18 +6166,4 @@ db_pageout(void) db_indent -= 2; } -#if MACH_CLUSTER_STATS -unsigned long vm_pageout_cluster_dirtied = 0; -unsigned long vm_pageout_cluster_cleaned = 0; -unsigned long 
vm_pageout_cluster_collisions = 0; -unsigned long vm_pageout_cluster_clusters = 0; -unsigned long vm_pageout_cluster_conversions = 0; -unsigned long vm_pageout_target_collisions = 0; -unsigned long vm_pageout_target_page_dirtied = 0; -unsigned long vm_pageout_target_page_freed = 0; -#define CLUSTER_STAT(clause) clause -#else /* MACH_CLUSTER_STATS */ -#define CLUSTER_STAT(clause) -#endif /* MACH_CLUSTER_STATS */ - #endif /* MACH_KDB */