X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/de355530ae67247cbd0da700edb3a2a1dae884c2..fe8ab488e9161c46dd9885d58fc52996dc0249ff:/osfmk/default_pager/dp_backing_store.c diff --git a/osfmk/default_pager/dp_backing_store.c b/osfmk/default_pager/dp_backing_store.c index 112ced6c6..205e612a2 100644 --- a/osfmk/default_pager/dp_backing_store.c +++ b/osfmk/default_pager/dp_backing_store.c @@ -1,24 +1,29 @@ - /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. * - * @APPLE_LICENSE_HEADER_END@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * @OSF_COPYRIGHT@ @@ -54,19 +59,32 @@ * Paging File Management. */ +#include #include #include -#include "default_pager_internal.h" +#include +#include #include +#include + +#include #include #include + +#include +#include #include #include #include + #include #include -/* CDY CDY */ #include +#include +#include + + +/* todo - need large internal object support */ /* * ALLOC_STRIDE... the maximum number of bytes allocated from @@ -83,16 +101,19 @@ int physical_transfer_cluster_count = 0; #define VM_SUPER_CLUSTER 0x40000 -#define VM_SUPER_PAGES 64 +#define VM_SUPER_PAGES (VM_SUPER_CLUSTER / PAGE_MIN_SIZE) /* * 0 means no shift to pages, so == 1 page/cluster. 1 would mean * 2 pages/cluster, 2 means 4 pages/cluster, and so on. 
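The cluster-shift comment above describes how a vstruct's cluster size is encoded, and the hunk replaces the hard-coded VM_SUPER_PAGES with a value derived from VM_SUPER_CLUSTER. As an aside, here is a minimal user-space sketch of that arithmetic, assuming a 4 KB minimum page size (the page size itself is not defined in this file):

#include <stdio.h>

/* Assumed values for illustration only; the kernel derives these elsewhere. */
#define PAGE_MIN_SIZE     4096          /* assumed 4 KB minimum page size */
#define VM_SUPER_CLUSTER  0x40000       /* 256 KB, as in the diff         */

int
main(void)
{
        /* clshift -> pages per cluster, exactly as the comment describes */
        for (int clshift = 0; clshift <= 2; clshift++) {
                unsigned pages_per_cluster = 1u << clshift;
                unsigned cluster_bytes     = pages_per_cluster * PAGE_MIN_SIZE;
                printf("clshift %d: %u page(s)/cluster, %u bytes/cluster\n",
                       clshift, pages_per_cluster, cluster_bytes);
        }
        /* With a 4 KB minimum page, VM_SUPER_CLUSTER / PAGE_MIN_SIZE == 64,
         * which matches the value that used to be hard-coded. */
        printf("VM_SUPER_PAGES = %u\n",
               (unsigned)(VM_SUPER_CLUSTER / PAGE_MIN_SIZE));
        return 0;
}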
*/ +#define VSTRUCT_MIN_CLSHIFT 0 + #define VSTRUCT_DEF_CLSHIFT 2 -int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT; int default_pager_clsize = 0; +int vstruct_def_clshift = VSTRUCT_DEF_CLSHIFT; + /* statistics */ unsigned int clustered_writes[VM_SUPER_PAGES+1]; unsigned int clustered_reads[VM_SUPER_PAGES+1]; @@ -115,7 +136,7 @@ int async_requests_out; #define VS_ASYNC_REUSE 1 struct vs_async *vs_async_free_list; -mutex_t default_pager_async_lock; /* Protects globals above */ +lck_mtx_t default_pager_async_lock; /* Protects globals above */ int vs_alloc_async_failed = 0; /* statistics */ @@ -127,25 +148,38 @@ void vs_free_async(struct vs_async *vsa); /* forward */ #define VS_ALLOC_ASYNC() vs_alloc_async() #define VS_FREE_ASYNC(vsa) vs_free_async(vsa) -#define VS_ASYNC_LOCK() mutex_lock(&default_pager_async_lock) -#define VS_ASYNC_UNLOCK() mutex_unlock(&default_pager_async_lock) -#define VS_ASYNC_LOCK_INIT() mutex_init(&default_pager_async_lock, \ - ETAP_IO_DEV_PAGEH) +#define VS_ASYNC_LOCK() lck_mtx_lock(&default_pager_async_lock) +#define VS_ASYNC_UNLOCK() lck_mtx_unlock(&default_pager_async_lock) +#define VS_ASYNC_LOCK_INIT() lck_mtx_init(&default_pager_async_lock, &default_pager_lck_grp, &default_pager_lck_attr) +#define VS_ASYNC_LOCK_DESTROY() lck_mtx_destroy(&default_pager_async_lock, &default_pager_lck_grp) #define VS_ASYNC_LOCK_ADDR() (&default_pager_async_lock) /* * Paging Space Hysteresis triggers and the target notification port * */ - +unsigned int dp_pages_free_drift_count = 0; +unsigned int dp_pages_free_drifted_max = 0; unsigned int minimum_pages_remaining = 0; unsigned int maximum_pages_free = 0; ipc_port_t min_pages_trigger_port = NULL; ipc_port_t max_pages_trigger_port = NULL; +#if CONFIG_FREEZE +boolean_t use_emergency_swap_file_first = TRUE; +#else +boolean_t use_emergency_swap_file_first = FALSE; +#endif boolean_t bs_low = FALSE; int backing_store_release_trigger_disable = 0; - +boolean_t backing_store_stop_compaction = FALSE; +boolean_t backing_store_abort_compaction = FALSE; +/* Have we decided if swap needs to be encrypted yet ? */ +boolean_t dp_encryption_inited = FALSE; +/* Should we encrypt swap ? */ +boolean_t dp_encryption = FALSE; + +boolean_t dp_isssd = FALSE; /* * Object sizes are rounded up to the next power of 2, @@ -156,9 +190,10 @@ vm_size_t max_doubled_size = 4 * 1024 * 1024; /* 4 meg */ /* * List of all backing store and segments. */ +MACH_PORT_FACE emergency_segment_backing_store; struct backing_store_list_head backing_store_list; paging_segment_t paging_segments[MAX_NUM_PAGING_SEGMENTS]; -mutex_t paging_segments_lock; +lck_mtx_t paging_segments_lock; int paging_segment_max = 0; int paging_segment_count = 0; int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 }; @@ -171,14 +206,46 @@ int ps_select_array[BS_MAXPRI+1] = { -1,-1,-1,-1,-1 }; * likely to be deprecated. 
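The paging-space hysteresis globals in this hunk (minimum_pages_remaining, maximum_pages_free and the two trigger ports) implement a two-threshold scheme: dropping below the low-water mark fires a HI_WAT_ALERT so the dynamic pager can add backing store, and climbing above the high-water mark fires a LO_WAT_ALERT so backing store can be reclaimed. The following is a rough user-space model of that logic, not the kernel's IPC-based implementation; the struct and function names are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

enum alert { ALERT_NONE, ALERT_HI_WAT, ALERT_LO_WAT };

struct pager_model {
        unsigned free_pages;
        unsigned minimum_pages_remaining;  /* low-water mark  */
        unsigned maximum_pages_free;       /* high-water mark */
        bool     min_trigger_armed;        /* stands in for min_pages_trigger_port */
        bool     max_trigger_armed;        /* stands in for max_pages_trigger_port */
        bool     bs_low;
};

/* An alert fires once when free paging space crosses a threshold and the
 * "port" is consumed, so the alert is not repeated until it is re-armed. */
static enum alert
check_space(struct pager_model *p)
{
        if (p->min_trigger_armed && p->free_pages < p->minimum_pages_remaining) {
                p->min_trigger_armed = false;   /* consume the trigger          */
                p->bs_low = true;               /* new swap writers should back off */
                return ALERT_HI_WAT;            /* ask for more backing store   */
        }
        if (p->max_trigger_armed && p->free_pages > p->maximum_pages_free) {
                p->max_trigger_armed = false;
                return ALERT_LO_WAT;            /* backing store can shrink     */
        }
        return ALERT_NONE;
}

int
main(void)
{
        struct pager_model p = { 50, 100, 10000, true, true, false };
        printf("alert=%d bs_low=%d\n", (int)check_space(&p), (int)p.bs_low);
        return 0;
}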
*/ unsigned int dp_pages_free = 0; +unsigned int dp_pages_reserve = 0; unsigned int cluster_transfer_minimum = 100; -kern_return_t ps_write_file(paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, int); /* forward */ -kern_return_t ps_read_file (paging_segment_t, upl_t, vm_offset_t, vm_offset_t, unsigned int, unsigned int *, int); /* forward */ +/* + * Trim state + */ +struct ps_vnode_trim_data { + struct vnode *vp; + dp_offset_t offset; + dp_size_t length; +}; + +/* forward declarations */ +kern_return_t ps_write_file(paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, int); /* forward */ +kern_return_t ps_read_file (paging_segment_t, upl_t, upl_offset_t, dp_offset_t, unsigned int, unsigned int *, int); /* forward */ +default_pager_thread_t *get_read_buffer( void ); +kern_return_t ps_vstruct_transfer_from_segment( + vstruct_t vs, + paging_segment_t segment, + upl_t upl); +kern_return_t ps_read_device(paging_segment_t, dp_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */ +kern_return_t ps_write_device(paging_segment_t, dp_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */ +kern_return_t vs_cluster_transfer( + vstruct_t vs, + dp_offset_t offset, + dp_size_t cnt, + upl_t upl); +vs_map_t vs_get_map_entry( + vstruct_t vs, + dp_offset_t offset); + +kern_return_t +default_pager_backing_store_delete_internal( MACH_PORT_FACE ); +static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data); +static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data); +static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length); default_pager_thread_t * -get_read_buffer() +get_read_buffer( void ) { int i; @@ -311,10 +378,10 @@ int default_pager_info_verbose = 1; void bs_global_info( - vm_size_t *totalp, - vm_size_t *freep) + uint64_t *totalp, + uint64_t *freep) { - vm_size_t pages_total, pages_free; + uint64_t pages_total, pages_free; paging_segment_t ps; int i; @@ -332,9 +399,9 @@ bs_global_info( */ pages_total += ps->ps_pgnum; pages_free += ps->ps_clcount << ps->ps_clshift; - DEBUG(DEBUG_BS_INTERNAL, - ("segment #%d: %d total, %d free\n", - i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift)); + DP_DEBUG(DEBUG_BS_INTERNAL, + ("segment #%d: %d total, %d free\n", + i, ps->ps_pgnum, ps->ps_clcount << ps->ps_clshift)); } *totalp = pages_total; *freep = pages_free; @@ -391,7 +458,7 @@ backing_store_lookup( if ((port == MACH_PORT_NULL) || port_is_vs(port)) */ - if ((port == MACH_PORT_NULL)) + if (port == MACH_PORT_NULL) return BACKING_STORE_NULL; BSL_LOCK(); @@ -413,10 +480,10 @@ void backing_store_add(backing_store_t); /* forward */ void backing_store_add( - backing_store_t bs) + __unused backing_store_t bs) { - MACH_PORT_FACE port = bs->bs_port; - MACH_PORT_FACE pset = default_pager_default_set; +// MACH_PORT_FACE port = bs->bs_port; +// MACH_PORT_FACE pset = default_pager_default_set; kern_return_t kr = KERN_SUCCESS; if (kr != KERN_SUCCESS) @@ -513,7 +580,7 @@ default_pager_backing_store_create( { backing_store_t bs; MACH_PORT_FACE port; - kern_return_t kr; +// kern_return_t kr; struct vstruct_alias *alias_struct; if (pager != default_pager_object) @@ -524,20 +591,23 @@ default_pager_backing_store_create( ipc_port_make_send(port); assert (port != IP_NULL); - DEBUG(DEBUG_BS_EXTERNAL, - ("priority=%d clsize=%d bs_port=0x%x\n", - priority, clsize, (int) backing_store)); + DP_DEBUG(DEBUG_BS_EXTERNAL, + ("priority=%d clsize=%d bs_port=0x%x\n", + priority, clsize, 
(int) backing_store)); alias_struct = (struct vstruct_alias *) kalloc(sizeof (struct vstruct_alias)); if(alias_struct != NULL) { alias_struct->vs = (struct vstruct *)bs; - alias_struct->name = ISVS; - port->alias = (int) alias_struct; + alias_struct->name = &default_pager_ops; + port->ip_alias = (uintptr_t) alias_struct; } else { ipc_port_dealloc_kernel((MACH_PORT_FACE)(port)); - kfree((vm_offset_t)bs, sizeof (struct backing_store)); + + BS_LOCK_DESTROY(bs); + kfree(bs, sizeof (struct backing_store)); + return KERN_RESOURCE_SHORTAGE; } @@ -550,7 +620,7 @@ default_pager_backing_store_create( priority = BS_MINPRI; bs->bs_priority = priority; - bs->bs_clsize = bs_get_global_clsize(atop(clsize)); + bs->bs_clsize = bs_get_global_clsize(atop_32(clsize)); BSL_LOCK(); queue_enter(&backing_store_list.bsl_queue, bs, backing_store_t, @@ -614,7 +684,7 @@ default_pager_backing_store_info( basic->bs_pages_out_fail= bs->bs_pages_out_fail; basic->bs_priority = bs->bs_priority; - basic->bs_clsize = ptoa(bs->bs_clsize); /* in bytes */ + basic->bs_clsize = ptoa_32(bs->bs_clsize); /* in bytes */ BS_UNLOCK(bs); @@ -622,6 +692,7 @@ default_pager_backing_store_info( } int ps_delete(paging_segment_t); /* forward */ +boolean_t current_thread_aborted(void); int ps_delete( @@ -663,6 +734,10 @@ ps_delete( if ((vs_count != 0) && (vs != NULL)) vs->vs_async_pending += 1; /* hold parties calling */ /* vs_async_wait */ + + if (bs_low == FALSE) + backing_store_abort_compaction = FALSE; + VS_UNLOCK(vs); VSL_UNLOCK(); while((vs_count != 0) && (vs != NULL)) { @@ -683,27 +758,33 @@ ps_delete( error = KERN_FAILURE; else { vm_object_t transfer_object; - int count; + unsigned int count; upl_t upl; + int upl_flags; - transfer_object = vm_object_allocate(VM_SUPER_CLUSTER); + transfer_object = vm_object_allocate((vm_object_size_t)VM_SUPER_CLUSTER); count = 0; + upl_flags = (UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | + UPL_SET_LITE | UPL_SET_INTERNAL); + if (dp_encryption) { + /* mark the pages as "encrypted" when they come in */ + upl_flags |= UPL_ENCRYPT; + } error = vm_object_upl_request(transfer_object, (vm_object_offset_t)0, VM_SUPER_CLUSTER, - &upl, NULL, &count, - UPL_NO_SYNC | UPL_CLEAN_IN_PLACE - | UPL_SET_INTERNAL); + &upl, NULL, &count, upl_flags); + if(error == KERN_SUCCESS) { error = ps_vstruct_transfer_from_segment( vs, ps, upl); - upl_commit(upl, NULL); + upl_commit(upl, NULL, 0); upl_deallocate(upl); } else { error = KERN_FAILURE; } vm_object_deallocate(transfer_object); } - if(error) { + if(error || current_thread_aborted()) { VS_LOCK(vs); vs->vs_async_pending -= 1; /* release vs_async_wait */ if (vs->vs_async_pending == 0 && vs->vs_waiting_async) { @@ -754,7 +835,7 @@ ps_delete( kern_return_t -default_pager_backing_store_delete( +default_pager_backing_store_delete_internal( MACH_PORT_FACE backing_store) { backing_store_t bs; @@ -762,28 +843,35 @@ default_pager_backing_store_delete( paging_segment_t ps; int error; int interim_pages_removed = 0; - kern_return_t kr; + boolean_t dealing_with_emergency_segment = ( backing_store == emergency_segment_backing_store ); if ((bs = backing_store_lookup(backing_store)) == BACKING_STORE_NULL) return KERN_INVALID_ARGUMENT; -#if 0 - /* not implemented */ - BS_UNLOCK(bs); - return KERN_FAILURE; -#endif - - restart: +restart: PSL_LOCK(); error = KERN_SUCCESS; for (i = 0; i <= paging_segment_max; i++) { ps = paging_segments[i]; if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs && - ! ps->ps_going_away) { + ! 
IS_PS_GOING_AWAY(ps)) { PS_LOCK(ps); + + if( IS_PS_GOING_AWAY(ps) || !IS_PS_OK_TO_USE(ps)) { + /* + * Someone is already busy reclamining this paging segment. + * If it's the emergency segment we are looking at then check + * that someone has not already recovered it and set the right + * state i.e. online but not activated. + */ + PS_UNLOCK(ps); + continue; + } + /* disable access to this segment */ - ps->ps_going_away = TRUE; + ps->ps_state &= ~PS_CAN_USE; + ps->ps_state |= PS_GOING_AWAY; PS_UNLOCK(ps); /* * The "ps" segment is "off-line" now, @@ -824,10 +912,26 @@ default_pager_backing_store_delete( ps = paging_segments[i]; if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs && - ps->ps_going_away) { + IS_PS_GOING_AWAY(ps)) { PS_LOCK(ps); + + if( !IS_PS_GOING_AWAY(ps)) { + PS_UNLOCK(ps); + continue; + } + /* Handle the special clusters that came in while we let go the lock*/ + if( ps->ps_special_clusters) { + dp_pages_free += ps->ps_special_clusters << ps->ps_clshift; + ps->ps_pgcount += ps->ps_special_clusters << ps->ps_clshift; + ps->ps_clcount += ps->ps_special_clusters; + if ( ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI) { + ps_select_array[ps->ps_bs->bs_priority] = 0; + } + ps->ps_special_clusters = 0; + } /* re-enable access to this segment */ - ps->ps_going_away = FALSE; + ps->ps_state &= ~PS_GOING_AWAY; + ps->ps_state |= PS_CAN_USE; PS_UNLOCK(ps); } } @@ -841,13 +945,22 @@ default_pager_backing_store_delete( ps = paging_segments[i]; if (ps != PAGING_SEGMENT_NULL && ps->ps_bs == bs) { - if(ps->ps_going_away) { - paging_segments[i] = PAGING_SEGMENT_NULL; - paging_segment_count--; - PS_LOCK(ps); - kfree((vm_offset_t)ps->ps_bmap, - RMAPSIZE(ps->ps_ncls)); - kfree((vm_offset_t)ps, sizeof *ps); + if(IS_PS_GOING_AWAY(ps)) { + if(IS_PS_EMERGENCY_SEGMENT(ps)) { + PS_LOCK(ps); + ps->ps_state &= ~PS_GOING_AWAY; + ps->ps_special_clusters = 0; + ps->ps_pgcount = ps->ps_pgnum; + ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift; + dp_pages_reserve += ps->ps_pgcount; + PS_UNLOCK(ps); + } else { + paging_segments[i] = PAGING_SEGMENT_NULL; + paging_segment_count--; + PS_LOCK(ps); + kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls)); + kfree(ps, sizeof *ps); + } } } } @@ -861,6 +974,11 @@ default_pager_backing_store_delete( PSL_UNLOCK(); + if( dealing_with_emergency_segment ) { + BS_UNLOCK(bs); + return KERN_SUCCESS; + } + /* * All the segments have been deleted. * We can remove the backing store. @@ -869,9 +987,9 @@ default_pager_backing_store_delete( /* * Disable lookups of this backing store. */ - if((void *)bs->bs_port->alias != NULL) - kfree((vm_offset_t) bs->bs_port->alias, - sizeof (struct vstruct_alias)); + if((void *)bs->bs_port->ip_alias != NULL) + kfree((void *) bs->bs_port->ip_alias, + sizeof (struct vstruct_alias)); ipc_port_dealloc_kernel((ipc_port_t) (bs->bs_port)); bs->bs_port = MACH_PORT_NULL; BS_UNLOCK(bs); @@ -887,11 +1005,22 @@ default_pager_backing_store_delete( /* * Free the backing store structure. 
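The delete path above replaces the old ps_going_away boolean with a ps_state bit mask tested through the IS_PS_* macros. A minimal sketch of that pattern follows; only the flag and macro names come from the diff, while the numeric bit values and the demo transitions are assumptions.

#include <stdio.h>

/* Flag names follow the diff; the numeric values are illustrative only. */
#define PS_CAN_USE            0x1
#define PS_GOING_AWAY         0x2
#define PS_EMERGENCY_SEGMENT  0x4

#define IS_PS_OK_TO_USE(ps)          ((ps)->ps_state & PS_CAN_USE)
#define IS_PS_GOING_AWAY(ps)         ((ps)->ps_state & PS_GOING_AWAY)
#define IS_PS_EMERGENCY_SEGMENT(ps)  ((ps)->ps_state & PS_EMERGENCY_SEGMENT)

struct paging_segment { unsigned ps_state; };

int
main(void)
{
        struct paging_segment seg = { .ps_state = PS_CAN_USE };

        /* Take the segment offline for deletion, as the delete path does:
         * clear CAN_USE so no new clusters are handed out, mark GOING_AWAY. */
        seg.ps_state &= ~PS_CAN_USE;
        seg.ps_state |= PS_GOING_AWAY;
        printf("offline: ok_to_use=%d going_away=%d\n",
               !!IS_PS_OK_TO_USE(&seg), !!IS_PS_GOING_AWAY(&seg));

        /* Re-enable it if the deletion is abandoned (the error path above). */
        seg.ps_state &= ~PS_GOING_AWAY;
        seg.ps_state |= PS_CAN_USE;
        printf("online:  ok_to_use=%d going_away=%d\n",
               !!IS_PS_OK_TO_USE(&seg), !!IS_PS_GOING_AWAY(&seg));
        return 0;
}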
*/ - kfree((vm_offset_t)bs, sizeof *bs); + BS_LOCK_DESTROY(bs); + kfree(bs, sizeof *bs); return KERN_SUCCESS; } +kern_return_t +default_pager_backing_store_delete( + MACH_PORT_FACE backing_store) +{ + if( backing_store != emergency_segment_backing_store ) { + default_pager_backing_store_delete_internal(emergency_segment_backing_store); + } + return(default_pager_backing_store_delete_internal(backing_store)); +} + int ps_enter(paging_segment_t); /* forward */ int @@ -986,7 +1115,8 @@ default_pager_add_segment( PS_LOCK_INIT(ps); ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls)); if (!ps->ps_bmap) { - kfree((vm_offset_t)ps, sizeof *ps); + PS_LOCK_DESTROY(ps); + kfree(ps, sizeof *ps); BS_UNLOCK(bs); return KERN_RESOURCE_SHORTAGE; } @@ -994,12 +1124,22 @@ default_pager_add_segment( clrbit(ps->ps_bmap, i); } - ps->ps_going_away = FALSE; + if(paging_segment_count == 0) { + ps->ps_state = PS_EMERGENCY_SEGMENT; + if(use_emergency_swap_file_first) { + ps->ps_state |= PS_CAN_USE; + } + } else { + ps->ps_state = PS_CAN_USE; + } + ps->ps_bs = bs; if ((error = ps_enter(ps)) != 0) { - kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls)); - kfree((vm_offset_t)ps, sizeof *ps); + kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls)); + + PS_LOCK_DESTROY(ps); + kfree(ps, sizeof *ps); BS_UNLOCK(bs); return KERN_RESOURCE_SHORTAGE; } @@ -1009,15 +1149,19 @@ default_pager_add_segment( BS_UNLOCK(bs); PSL_LOCK(); - dp_pages_free += ps->ps_pgcount; + if(IS_PS_OK_TO_USE(ps)) { + dp_pages_free += ps->ps_pgcount; + } else { + dp_pages_reserve += ps->ps_pgcount; + } PSL_UNLOCK(); bs_more_space(ps->ps_clcount); - DEBUG(DEBUG_BS_INTERNAL, - ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n", - device, offset, count, record_size, - ps->ps_record_shift, ps->ps_pgnum)); + DP_DEBUG(DEBUG_BS_INTERNAL, + ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n", + device, offset, count, record_size, + ps->ps_record_shift, ps->ps_pgnum)); return KERN_SUCCESS; } @@ -1073,7 +1217,7 @@ vs_alloc_async(void) { struct vs_async *vsa; MACH_PORT_FACE reply_port; - kern_return_t kr; +// kern_return_t kr; VS_ASYNC_LOCK(); if (vs_async_free_list == NULL) { @@ -1091,8 +1235,8 @@ vs_alloc_async(void) kalloc(sizeof (struct vstruct_alias)); if(alias_struct != NULL) { alias_struct->vs = (struct vstruct *)vsa; - alias_struct->name = ISVS; - reply_port->alias = (int) alias_struct; + alias_struct->name = &default_pager_ops; + reply_port->ip_alias = (uintptr_t) alias_struct; vsa->reply_port = reply_port; vs_alloc_async_count++; } @@ -1100,8 +1244,7 @@ vs_alloc_async(void) vs_alloc_async_failed++; ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port)); - kfree((vm_offset_t)vsa, - sizeof (struct vs_async)); + kfree(vsa, sizeof (struct vs_async)); vsa = NULL; } } @@ -1144,8 +1287,8 @@ vs_alloc_async(void) kalloc(sizeof (struct vstruct_alias)); if(alias_struct != NULL) { alias_struct->vs = reply_port; - alias_struct->name = ISVS; - reply_port->alias = (int) vsa; + alias_struct->name = &default_pager_ops; + reply_port->defpager_importance.alias = (int) vsa; vsa->reply_port = reply_port; vs_alloc_async_count++; } @@ -1153,8 +1296,7 @@ vs_alloc_async(void) vs_alloc_async_failed++; ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port)); - kfree((vm_offset_t) vsa, - sizeof (struct vs_async)); + kfree(vsa, sizeof (struct vs_async)); vsa = NULL; } } @@ -1170,8 +1312,8 @@ vs_free_async( kern_return_t kr; reply_port = vsa->reply_port; - kfree((vm_offset_t) reply_port->alias, sizeof (struct vstuct_alias)); 
- kfree((vm_offset_t) vsa, sizeof (struct vs_async)); + kfree(reply_port->ip_alias, sizeof (struct vstuct_alias)); + kfree(vsa, sizeof (struct vs_async)); ipc_port_dealloc_kernel((MACH_PORT_FACE) (reply_port)); #if 0 VS_ASYNC_LOCK(); @@ -1186,10 +1328,10 @@ zone_t vstruct_zone; vstruct_t ps_vstruct_create( - vm_size_t size) + dp_size_t size) { vstruct_t vs; - int i; + unsigned int i; vs = (vstruct_t) zalloc(vstruct_zone); if (vs == VSTRUCT_NULL) { @@ -1201,23 +1343,15 @@ ps_vstruct_create( /* * The following fields will be provided later. */ - vs->vs_mem_obj = NULL; + vs->vs_pager_ops = NULL; vs->vs_control = MEMORY_OBJECT_CONTROL_NULL; vs->vs_references = 1; vs->vs_seqno = 0; -#ifdef MACH_KERNEL vs->vs_waiting_seqno = FALSE; vs->vs_waiting_read = FALSE; vs->vs_waiting_write = FALSE; vs->vs_waiting_async = FALSE; -#else - mutex_init(&vs->vs_waiting_seqno, ETAP_DPAGE_VSSEQNO); - mutex_init(&vs->vs_waiting_read, ETAP_DPAGE_VSREAD); - mutex_init(&vs->vs_waiting_write, ETAP_DPAGE_VSWRITE); - mutex_init(&vs->vs_waiting_refs, ETAP_DPAGE_VSREFS); - mutex_init(&vs->vs_waiting_async, ETAP_DPAGE_VSASYNC); -#endif vs->vs_readers = 0; vs->vs_writers = 0; @@ -1225,7 +1359,7 @@ ps_vstruct_create( vs->vs_errors = 0; vs->vs_clshift = local_log2(bs_get_global_clsize(0)); - vs->vs_size = ((atop(round_page(size)) - 1) >> vs->vs_clshift) + 1; + vs->vs_size = ((atop_32(round_page_32(size)) - 1) >> vs->vs_clshift) + 1; vs->vs_async_pending = 0; /* @@ -1242,14 +1376,14 @@ ps_vstruct_create( vs->vs_indirect = FALSE; } vs->vs_xfer_pending = FALSE; - DEBUG(DEBUG_VS_INTERNAL, - ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect)); + DP_DEBUG(DEBUG_VS_INTERNAL, + ("map=0x%x, indirect=%d\n", (int) vs->vs_dmap, vs->vs_indirect)); /* * Check to see that we got the space. 
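ps_vstruct_create() above sizes the cluster map with ((atop_32(round_page_32(size)) - 1) >> vs_clshift) + 1, i.e. a ceiling division of the object's page count by the pages-per-cluster. A worked example of that formula, assuming a 4 KB page and the default cluster shift of 2:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1u << PAGE_SHIFT)

/* atop() and round_page() are modelled with plain shifts in this sketch. */
static unsigned
clusters_needed(unsigned long long size, unsigned clshift)
{
        unsigned long long pages =
            (size + PAGE_SIZE - 1) >> PAGE_SHIFT;        /* atop(round_page(size)) */
        if (pages == 0)
                pages = 1;                               /* avoid the size==0 corner in this sketch */
        return (unsigned)(((pages - 1) >> clshift) + 1); /* ceil(pages / 2^clshift) */
}

int
main(void)
{
        /* clshift 2 => 4 pages (16 KB) per cluster */
        printf("%u\n", clusters_needed(1u << 20, 2));        /* 1 MB      -> 64 clusters */
        printf("%u\n", clusters_needed((1u << 20) + 1, 2));  /* 1 MB + 1B -> 65 clusters */
        return 0;
}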
*/ if (!vs->vs_dmap) { - kfree((vm_offset_t)vs, sizeof *vs); + kfree(vs, sizeof *vs); return VSTRUCT_NULL; } @@ -1270,12 +1404,12 @@ ps_vstruct_create( return vs; } -paging_segment_t ps_select_segment(int, int *); /* forward */ +paging_segment_t ps_select_segment(unsigned int, int *); /* forward */ paging_segment_t ps_select_segment( - int shift, - int *psindex) + unsigned int shift, + int *psindex) { paging_segment_t ps; int i; @@ -1288,34 +1422,49 @@ ps_select_segment( PSL_LOCK(); if (paging_segment_count == 1) { - paging_segment_t lps; /* used to avoid extra PS_UNLOCK */ + paging_segment_t lps = PAGING_SEGMENT_NULL; /* used to avoid extra PS_UNLOCK */ ipc_port_t trigger = IP_NULL; ps = paging_segments[paging_segment_max]; *psindex = paging_segment_max; PS_LOCK(ps); - if (ps->ps_going_away) { - /* this segment is being turned off */ - lps = PAGING_SEGMENT_NULL; - } else { - ASSERT(ps->ps_clshift >= shift); + if( !IS_PS_EMERGENCY_SEGMENT(ps) ) { + panic("Emergency paging segment missing\n"); + } + ASSERT(ps->ps_clshift >= shift); + if(IS_PS_OK_TO_USE(ps)) { if (ps->ps_clcount) { ps->ps_clcount--; dp_pages_free -= 1 << ps->ps_clshift; + ps->ps_pgcount -= 1 << ps->ps_clshift; if(min_pages_trigger_port && (dp_pages_free < minimum_pages_remaining)) { trigger = min_pages_trigger_port; min_pages_trigger_port = NULL; bs_low = TRUE; + backing_store_abort_compaction = TRUE; } lps = ps; - } else - lps = PAGING_SEGMENT_NULL; - } + } + } PS_UNLOCK(ps); + + if( lps == PAGING_SEGMENT_NULL ) { + if(dp_pages_free) { + dp_pages_free_drift_count++; + if(dp_pages_free > dp_pages_free_drifted_max) { + dp_pages_free_drifted_max = dp_pages_free; + } + dprintf(("Emergency swap segment:dp_pages_free before zeroing out: %d\n",dp_pages_free)); + } + dp_pages_free = 0; + } + PSL_UNLOCK(); if (trigger != IP_NULL) { + dprintf(("ps_select_segment - send HI_WAT_ALERT\n")); + default_pager_space_alert(trigger, HI_WAT_ALERT); ipc_port_release_send(trigger); } @@ -1323,6 +1472,14 @@ ps_select_segment( } if (paging_segment_count == 0) { + if(dp_pages_free) { + dp_pages_free_drift_count++; + if(dp_pages_free > dp_pages_free_drifted_max) { + dp_pages_free_drifted_max = dp_pages_free; + } + dprintf(("No paging segments:dp_pages_free before zeroing out: %d\n",dp_pages_free)); + } + dp_pages_free = 0; PSL_UNLOCK(); return PAGING_SEGMENT_NULL; } @@ -1364,35 +1521,40 @@ ps_select_segment( * >= that of the vstruct. */ PS_LOCK(ps); - if (ps->ps_going_away) { - /* this segment is being turned off */ - } else if ((ps->ps_clcount) && - (ps->ps_clshift >= shift)) { - ipc_port_t trigger = IP_NULL; - - ps->ps_clcount--; - dp_pages_free -= 1 << ps->ps_clshift; - if(min_pages_trigger_port && - (dp_pages_free < - minimum_pages_remaining)) { - trigger = min_pages_trigger_port; - min_pages_trigger_port = NULL; - } - PS_UNLOCK(ps); - /* - * found one, quit looking. - */ - ps_select_array[i] = j; - PSL_UNLOCK(); - - if (trigger != IP_NULL) { - default_pager_space_alert( - trigger, - HI_WAT_ALERT); - ipc_port_release_send(trigger); + if (IS_PS_OK_TO_USE(ps)) { + if ((ps->ps_clcount) && + (ps->ps_clshift >= shift)) { + ipc_port_t trigger = IP_NULL; + + ps->ps_clcount--; + dp_pages_free -= 1 << ps->ps_clshift; + ps->ps_pgcount -= 1 << ps->ps_clshift; + if(min_pages_trigger_port && + (dp_pages_free < + minimum_pages_remaining)) { + trigger = min_pages_trigger_port; + min_pages_trigger_port = NULL; + bs_low = TRUE; + backing_store_abort_compaction = TRUE; + } + PS_UNLOCK(ps); + /* + * found one, quit looking. 
+ */ + ps_select_array[i] = j; + PSL_UNLOCK(); + + if (trigger != IP_NULL) { + dprintf(("ps_select_segment - send HI_WAT_ALERT\n")); + + default_pager_space_alert( + trigger, + HI_WAT_ALERT); + ipc_port_release_send(trigger); + } + *psindex = j; + return ps; } - *psindex = j; - return ps; } PS_UNLOCK(ps); } @@ -1406,22 +1568,31 @@ ps_select_segment( j++; } } + + if(dp_pages_free) { + dp_pages_free_drift_count++; + if(dp_pages_free > dp_pages_free_drifted_max) { + dp_pages_free_drifted_max = dp_pages_free; + } + dprintf(("%d Paging Segments: dp_pages_free before zeroing out: %d\n",paging_segment_count,dp_pages_free)); + } + dp_pages_free = 0; PSL_UNLOCK(); return PAGING_SEGMENT_NULL; } -vm_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/ +dp_offset_t ps_allocate_cluster(vstruct_t, int *, paging_segment_t); /*forward*/ -vm_offset_t +dp_offset_t ps_allocate_cluster( vstruct_t vs, int *psindex, paging_segment_t use_ps) { - int byte_num; + unsigned int byte_num; int bit_num = 0; paging_segment_t ps; - vm_offset_t cluster; + dp_offset_t cluster; ipc_port_t trigger = IP_NULL; /* @@ -1447,50 +1618,132 @@ ps_allocate_cluster( * This and the ordering of the paging segment "going_away" bit setting * protects us. */ +retry: if (use_ps != PAGING_SEGMENT_NULL) { ps = use_ps; PSL_LOCK(); PS_LOCK(ps); + + ASSERT(ps->ps_clcount != 0); + ps->ps_clcount--; dp_pages_free -= 1 << ps->ps_clshift; + ps->ps_pgcount -= 1 << ps->ps_clshift; if(min_pages_trigger_port && (dp_pages_free < minimum_pages_remaining)) { trigger = min_pages_trigger_port; min_pages_trigger_port = NULL; + bs_low = TRUE; + backing_store_abort_compaction = TRUE; } PSL_UNLOCK(); PS_UNLOCK(ps); if (trigger != IP_NULL) { + dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n")); + default_pager_space_alert(trigger, HI_WAT_ALERT); ipc_port_release_send(trigger); } } else if ((ps = ps_select_segment(vs->vs_clshift, psindex)) == PAGING_SEGMENT_NULL) { -#if 0 - bs_no_paging_space(TRUE); -#endif -#if 0 - if (verbose) -#endif - dprintf(("no space in available paging segments; " - "swapon suggested\n")); - /* the count got off maybe, reset to zero */ + static clock_sec_t lastnotify = 0; + clock_sec_t now; + clock_nsec_t nanoseconds_dummy; + + /* + * Don't immediately jump to the emergency segment. Give the + * dynamic pager a chance to create it's first normal swap file. + * Unless, of course the very first normal swap file can't be + * created due to some problem and we didn't expect that problem + * i.e. use_emergency_swap_file_first was never set to true initially. + * It then gets set in the swap file creation error handling. + */ + if(paging_segment_count > 1 || use_emergency_swap_file_first == TRUE) { + + ps = paging_segments[EMERGENCY_PSEG_INDEX]; + if(IS_PS_EMERGENCY_SEGMENT(ps) && !IS_PS_GOING_AWAY(ps)) { + PSL_LOCK(); + PS_LOCK(ps); + + if(IS_PS_GOING_AWAY(ps)) { + /* Someone de-activated the emergency paging segment*/ + PS_UNLOCK(ps); + PSL_UNLOCK(); + + } else if(dp_pages_free) { + /* + * Someone has already activated the emergency paging segment + * OR + * Between us having rec'd a NULL segment from ps_select_segment + * and reaching here a new normal segment could have been added. + * E.g. we get NULL segment and another thread just added the + * new swap file. Hence check to see if we have more dp_pages_free + * before activating the emergency segment. 
+ */ + PS_UNLOCK(ps); + PSL_UNLOCK(); + goto retry; + + } else if(!IS_PS_OK_TO_USE(ps) && ps->ps_clcount) { + /* + * PS_CAN_USE is only reset from the emergency segment when it's + * been successfully recovered. So it's legal to have an emergency + * segment that has PS_CAN_USE but no clusters because it's recovery + * failed. + */ + backing_store_t bs = ps->ps_bs; + ps->ps_state |= PS_CAN_USE; + if(ps_select_array[bs->bs_priority] == BS_FULLPRI || + ps_select_array[bs->bs_priority] == BS_NOPRI) { + ps_select_array[bs->bs_priority] = 0; + } + dp_pages_free += ps->ps_pgcount; + dp_pages_reserve -= ps->ps_pgcount; + PS_UNLOCK(ps); + PSL_UNLOCK(); + dprintf(("Switching ON Emergency paging segment\n")); + goto retry; + } + + PS_UNLOCK(ps); + PSL_UNLOCK(); + } + } + + /* + * Emit a notification of the low-paging resource condition + * but don't issue it more than once every five seconds. This + * prevents us from overflowing logs with thousands of + * repetitions of the message. + */ + clock_get_system_nanotime(&now, &nanoseconds_dummy); + if (paging_segment_count > 1 && (now > lastnotify + 5)) { + /* With an activated emergency paging segment we still + * didn't get any clusters. This could mean that the + * emergency paging segment is exhausted. + */ + dprintf(("System is out of paging space.\n")); + lastnotify = now; + } + PSL_LOCK(); - dp_pages_free = 0; + if(min_pages_trigger_port) { trigger = min_pages_trigger_port; min_pages_trigger_port = NULL; bs_low = TRUE; + backing_store_abort_compaction = TRUE; } PSL_UNLOCK(); if (trigger != IP_NULL) { + dprintf(("ps_allocate_cluster - send HI_WAT_ALERT\n")); + default_pager_space_alert(trigger, HI_WAT_ALERT); ipc_port_release_send(trigger); } - return (vm_offset_t) -1; + return (dp_offset_t) -1; } - ASSERT(ps->ps_clcount != 0); /* * Look for an available cluster. At the end of the loop, @@ -1521,16 +1774,15 @@ ps_allocate_cluster( return cluster; } -void ps_deallocate_cluster(paging_segment_t, vm_offset_t); /* forward */ +void ps_deallocate_cluster(paging_segment_t, dp_offset_t); /* forward */ void ps_deallocate_cluster( paging_segment_t ps, - vm_offset_t cluster) + dp_offset_t cluster) { - ipc_port_t trigger = IP_NULL; - if (cluster >= (vm_offset_t) ps->ps_ncls) + if (cluster >= ps->ps_ncls) panic("ps_deallocate_cluster: Invalid cluster number"); /* @@ -1540,15 +1792,13 @@ ps_deallocate_cluster( PSL_LOCK(); PS_LOCK(ps); clrbit(ps->ps_bmap, cluster); - ++ps->ps_clcount; - dp_pages_free += 1 << ps->ps_clshift; - if(max_pages_trigger_port - && (backing_store_release_trigger_disable == 0) - && (dp_pages_free > maximum_pages_free)) { - trigger = max_pages_trigger_port; - max_pages_trigger_port = NULL; + if( IS_PS_OK_TO_USE(ps)) { + ++ps->ps_clcount; + ps->ps_pgcount += 1 << ps->ps_clshift; + dp_pages_free += 1 << ps->ps_clshift; + } else { + ps->ps_special_clusters += 1; } - PSL_UNLOCK(); /* * Move the hint down to the freed cluster if it is @@ -1558,54 +1808,51 @@ ps_deallocate_cluster( ps->ps_hint = (cluster/NBBY); } - PS_UNLOCK(ps); /* * If we're freeing space on a full priority, reset the array. 
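ps_allocate_cluster() and ps_deallocate_cluster() above manage clusters through the per-segment bitmap ps_bmap plus a byte-granular hint (ps_hint) that is moved back when a lower cluster is freed. A toy, self-contained version of that hint-assisted bitmap is sketched below; the XNU setbit/clrbit macros, the exact bit ordering, and all locking are omitted, and the sizes are arbitrary.

#include <limits.h>
#include <stdio.h>
#include <string.h>

#define NCLUSTERS 64
#define NBBY      CHAR_BIT

static unsigned char bmap[(NCLUSTERS + NBBY - 1) / NBBY];
static unsigned hint;   /* byte index to start scanning from */

/* Scan from the hint for the first byte with a clear bit, claim it. */
static int
cluster_alloc(void)
{
        for (unsigned byte = hint; byte < sizeof bmap; byte++) {
                if (bmap[byte] == 0xff)
                        continue;
                for (unsigned bit = 0; bit < NBBY; bit++) {
                        unsigned cl = byte * NBBY + bit;
                        if (cl >= NCLUSTERS)
                                return -1;
                        if (!(bmap[byte] & (1u << bit))) {
                                bmap[byte] |= (1u << bit);   /* setbit() analogue */
                                hint = byte;
                                return (int)cl;
                        }
                }
        }
        return -1;      /* no free cluster */
}

/* Clear the bit and pull the hint back so the freed cluster is found fast. */
static void
cluster_free(unsigned cl)
{
        bmap[cl / NBBY] &= ~(1u << (cl % NBBY));             /* clrbit() analogue */
        if (cl / NBBY < hint)
                hint = cl / NBBY;
}

int
main(void)
{
        memset(bmap, 0, sizeof bmap);
        int a = cluster_alloc(), b = cluster_alloc();
        cluster_free((unsigned)a);
        printf("a=%d b=%d next=%d\n", a, b, cluster_alloc());  /* next == a */
        return 0;
}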
*/ - PSL_LOCK(); - if (ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI) + if ( IS_PS_OK_TO_USE(ps) && ps_select_array[ps->ps_bs->bs_priority] == BS_FULLPRI) ps_select_array[ps->ps_bs->bs_priority] = 0; + PS_UNLOCK(ps); PSL_UNLOCK(); - if (trigger != IP_NULL) { - VSL_LOCK(); - if(backing_store_release_trigger_disable != 0) { - assert_wait((event_t) - &backing_store_release_trigger_disable, - THREAD_UNINT); - VSL_UNLOCK(); - thread_block(THREAD_CONTINUE_NULL); - } else { - VSL_UNLOCK(); - } - default_pager_space_alert(trigger, LO_WAT_ALERT); - ipc_port_release_send(trigger); - } - return; } -void ps_dealloc_vsmap(struct vs_map *, vm_size_t); /* forward */ +void ps_dealloc_vsmap(struct vs_map *, dp_size_t); /* forward */ void ps_dealloc_vsmap( struct vs_map *vsmap, - vm_size_t size) + dp_size_t size) { - int i; - for (i = 0; i < size; i++) - if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i])) + unsigned int i; + struct ps_vnode_trim_data trim_data; + + ps_vnode_trim_init(&trim_data); + + for (i = 0; i < size; i++) { + if (!VSM_ISCLR(vsmap[i]) && !VSM_ISERR(vsmap[i])) { + ps_vnode_trim_more(&trim_data, + &vsmap[i], + VSM_PS(vsmap[i])->ps_clshift, + vm_page_size << VSM_PS(vsmap[i])->ps_clshift); ps_deallocate_cluster(VSM_PS(vsmap[i]), VSM_CLOFF(vsmap[i])); + } else { + ps_vnode_trim_now(&trim_data); + } + } + ps_vnode_trim_now(&trim_data); } void ps_vstruct_dealloc( vstruct_t vs) { - int i; - spl_t s; + unsigned int i; +// spl_t s; VS_MAP_LOCK(vs); @@ -1622,31 +1869,160 @@ ps_vstruct_dealloc( for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) { if (vs->vs_imap[i] != NULL) { ps_dealloc_vsmap(vs->vs_imap[i], CLMAP_ENTRIES); - kfree((vm_offset_t)vs->vs_imap[i], - CLMAP_THRESHOLD); + kfree(vs->vs_imap[i], CLMAP_THRESHOLD); } } - kfree((vm_offset_t)vs->vs_imap, - INDIRECT_CLMAP_SIZE(vs->vs_size)); + kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size)); } else { /* * Direct map. Free used clusters, then memory. */ ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size); - kfree((vm_offset_t)vs->vs_dmap, CLMAP_SIZE(vs->vs_size)); + kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size)); } VS_MAP_UNLOCK(vs); bs_commit(- vs->vs_size); - zfree(vstruct_zone, (vm_offset_t)vs); + VS_MAP_LOCK_DESTROY(vs); + + zfree(vstruct_zone, vs); +} + +kern_return_t +ps_vstruct_reclaim( + vstruct_t vs, + boolean_t return_to_vm, + boolean_t reclaim_backing_store) +{ + unsigned int i, j; + struct vs_map *vsmap; + boolean_t vsmap_all_clear, vsimap_all_clear; + struct vm_object_fault_info fault_info; + int clmap_off; + unsigned int vsmap_size; + kern_return_t kr = KERN_SUCCESS; + + VS_MAP_LOCK(vs); + + fault_info.cluster_size = VM_SUPER_CLUSTER; + fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL; + fault_info.user_tag = 0; + fault_info.pmap_options = 0; + fault_info.lo_offset = 0; + fault_info.hi_offset = ptoa_32(vs->vs_size << vs->vs_clshift); + fault_info.io_sync = reclaim_backing_store; + fault_info.batch_pmap_op = FALSE; + + /* + * If this is an indirect structure, then we walk through the valid + * (non-zero) indirect pointers and deallocate the clusters + * associated with each used map entry (via ps_dealloc_vsmap). + * When all of the clusters in an indirect block have been + * freed, we deallocate the block. When all of the indirect + * blocks have been deallocated we deallocate the memory + * holding the indirect pointers. 
+ */ + if (vs->vs_indirect) { + vsimap_all_clear = TRUE; + for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) { + vsmap = vs->vs_imap[i]; + if (vsmap == NULL) + continue; + /* loop on clusters in this indirect map */ + clmap_off = (vm_page_size * CLMAP_ENTRIES * + VSCLSIZE(vs) * i); + if (i+1 == INDIRECT_CLMAP_ENTRIES(vs->vs_size)) + vsmap_size = vs->vs_size - (CLMAP_ENTRIES * i); + else + vsmap_size = CLMAP_ENTRIES; + vsmap_all_clear = TRUE; + if (return_to_vm) { + for (j = 0; j < vsmap_size;) { + if (VSM_ISCLR(vsmap[j]) || + VSM_ISERR(vsmap[j])) { + j++; + clmap_off += vm_page_size * VSCLSIZE(vs); + continue; + } + VS_MAP_UNLOCK(vs); + kr = pvs_cluster_read( + vs, + clmap_off, + (dp_size_t) -1, /* read whole cluster */ + &fault_info); + + VS_MAP_LOCK(vs); /* XXX what if it changed ? */ + if (kr != KERN_SUCCESS) { + vsmap_all_clear = FALSE; + vsimap_all_clear = FALSE; + + kr = KERN_MEMORY_ERROR; + goto out; + } + } + } + if (vsmap_all_clear) { + ps_dealloc_vsmap(vsmap, CLMAP_ENTRIES); + kfree(vsmap, CLMAP_THRESHOLD); + vs->vs_imap[i] = NULL; + } + } + if (vsimap_all_clear) { +// kfree(vs->vs_imap, INDIRECT_CLMAP_SIZE(vs->vs_size)); + } + } else { + /* + * Direct map. Free used clusters, then memory. + */ + vsmap = vs->vs_dmap; + if (vsmap == NULL) { + goto out; + } + vsmap_all_clear = TRUE; + /* loop on clusters in the direct map */ + if (return_to_vm) { + for (j = 0; j < vs->vs_size;) { + if (VSM_ISCLR(vsmap[j]) || + VSM_ISERR(vsmap[j])) { + j++; + continue; + } + clmap_off = vm_page_size * (j << vs->vs_clshift); + VS_MAP_UNLOCK(vs); + kr = pvs_cluster_read( + vs, + clmap_off, + (dp_size_t) -1, /* read whole cluster */ + &fault_info); + + VS_MAP_LOCK(vs); /* XXX what if it changed ? */ + if (kr != KERN_SUCCESS) { + vsmap_all_clear = FALSE; + + kr = KERN_MEMORY_ERROR; + goto out; + } else { +// VSM_CLR(vsmap[j]); + } + } + } + if (vsmap_all_clear) { + ps_dealloc_vsmap(vs->vs_dmap, vs->vs_size); +// kfree(vs->vs_dmap, CLMAP_SIZE(vs->vs_size)); + } + } +out: + VS_MAP_UNLOCK(vs); + + return kr; } -int ps_map_extend(vstruct_t, int); /* forward */ +int ps_map_extend(vstruct_t, unsigned int); /* forward */ int ps_map_extend( vstruct_t vs, - int new_size) + unsigned int new_size) { struct vs_map **new_imap; struct vs_map *new_dmap = NULL; @@ -1708,7 +2084,7 @@ int ps_map_extend( /* Allocate an indirect page */ if ((new_imap[0] = (struct vs_map *) kalloc(CLMAP_THRESHOLD)) == NULL) { - kfree((vm_offset_t)new_imap, new_map_size); + kfree(new_imap, new_map_size); return -1; } new_dmap = new_imap[0]; @@ -1749,29 +2125,29 @@ int ps_map_extend( bs_commit(new_size - vs->vs_size); vs->vs_size = new_size; if (old_map) - kfree((vm_offset_t)old_map, old_map_size); + kfree(old_map, old_map_size); return 0; } -vm_offset_t +dp_offset_t ps_clmap( vstruct_t vs, - vm_offset_t offset, + dp_offset_t offset, struct clmap *clmap, int flag, - vm_size_t size, + dp_size_t size, int error) { - vm_offset_t cluster; /* The cluster of offset. */ - vm_offset_t newcl; /* The new cluster allocated. */ - vm_offset_t newoff; - int i; + dp_offset_t cluster; /* The cluster of offset. */ + dp_offset_t newcl; /* The new cluster allocated. 
*/ + dp_offset_t newoff; + unsigned int i; struct vs_map *vsmap; VS_MAP_LOCK(vs); ASSERT(vs->vs_dmap); - cluster = atop(offset) >> vs->vs_clshift; + cluster = atop_32(offset) >> vs->vs_clshift; /* * Initialize cluster error value @@ -1785,11 +2161,11 @@ ps_clmap( if (flag == CL_FIND) { /* Do not allocate if just doing a lookup */ VS_MAP_UNLOCK(vs); - return (vm_offset_t) -1; + return (dp_offset_t) -1; } if (ps_map_extend(vs, cluster + 1)) { VS_MAP_UNLOCK(vs); - return (vm_offset_t) -1; + return (dp_offset_t) -1; } } @@ -1811,14 +2187,14 @@ ps_clmap( if (vsmap == NULL) { if (flag == CL_FIND) { VS_MAP_UNLOCK(vs); - return (vm_offset_t) -1; + return (dp_offset_t) -1; } /* Allocate the indirect block */ vsmap = (struct vs_map *) kalloc(CLMAP_THRESHOLD); if (vsmap == NULL) { VS_MAP_UNLOCK(vs); - return (vm_offset_t) -1; + return (dp_offset_t) -1; } /* Initialize the cluster offsets */ for (i = 0; i < CLMAP_ENTRIES; i++) @@ -1842,7 +2218,7 @@ ps_clmap( if (VSM_ISERR(*vsmap)) { clmap->cl_error = VSM_GETERR(*vsmap); VS_MAP_UNLOCK(vs); - return (vm_offset_t) -1; + return (dp_offset_t) -1; } else if (VSM_ISCLR(*vsmap)) { int psindex; @@ -1856,16 +2232,16 @@ ps_clmap( VSM_SETERR(*vsmap, error); } VS_MAP_UNLOCK(vs); - return (vm_offset_t) -1; + return (dp_offset_t) -1; } else { /* * Attempt to allocate a cluster from the paging segment */ newcl = ps_allocate_cluster(vs, &psindex, PAGING_SEGMENT_NULL); - if (newcl == -1) { + if (newcl == (dp_offset_t) -1) { VS_MAP_UNLOCK(vs); - return (vm_offset_t) -1; + return (dp_offset_t) -1; } VSM_CLR(*vsmap); VSM_SETCLOFF(*vsmap, newcl); @@ -1887,14 +2263,14 @@ ps_clmap( * relatively quick. */ ASSERT(trunc_page(offset) == offset); - newcl = ptoa(newcl) << vs->vs_clshift; + newcl = ptoa_32(newcl) << vs->vs_clshift; newoff = offset & ((1<<(vm_page_shift + vs->vs_clshift)) - 1); if (flag == CL_ALLOC) { /* * set bits in the allocation bitmap according to which * pages were requested. size is in bytes. */ - i = atop(newoff); + i = atop_32(newoff); while ((size > 0) && (i < VSCLSIZE(vs))) { VSM_SETALLOC(*vsmap, i); i++; @@ -1907,7 +2283,7 @@ ps_clmap( * Offset is not cluster aligned, so number of pages * and bitmaps must be adjusted */ - clmap->cl_numpages -= atop(newoff); + clmap->cl_numpages -= atop_32(newoff); CLMAP_SHIFT(clmap, vs); CLMAP_SHIFTALLOC(clmap, vs); } @@ -1924,7 +2300,7 @@ ps_clmap( * entire cluster is in error. */ if (size && flag == CL_FIND) { - vm_offset_t off = (vm_offset_t) 0; + dp_offset_t off = (dp_offset_t) 0; if (!error) { for (i = VSCLSIZE(vs) - clmap->cl_numpages; size > 0; @@ -1936,7 +2312,7 @@ ps_clmap( } else { BS_STAT(clmap->cl_ps->ps_bs, clmap->cl_ps->ps_bs->bs_pages_out_fail += - atop(size)); + atop_32(size)); off = VSM_CLOFF(*vsmap); VSM_SETERR(*vsmap, error); } @@ -1944,34 +2320,37 @@ ps_clmap( * Deallocate cluster if error, and no valid pages * already present. 
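ps_clmap() above converts a backing-store offset into a cluster index with atop_32(offset) >> vs_clshift and splits off the in-cluster remainder with offset & ((1 << (vm_page_shift + vs_clshift)) - 1). A worked example of that arithmetic, assuming vm_page_shift of 12 and a cluster shift of 2:

#include <stdio.h>

#define VM_PAGE_SHIFT 12     /* assumed 4 KB pages      */
#define CLSHIFT       2      /* 4 pages per cluster     */

int
main(void)
{
        unsigned long long offset = 0x23000;   /* arbitrary example offset */

        unsigned long long page    = offset >> VM_PAGE_SHIFT;           /* atop()            */
        unsigned long long cluster = page >> CLSHIFT;                   /* 0x23 >> 2 == 8    */
        unsigned long long newoff  =
            offset & ((1ull << (VM_PAGE_SHIFT + CLSHIFT)) - 1);         /* bytes into cluster */
        unsigned long long pg_in_cl = newoff >> VM_PAGE_SHIFT;          /* atop(newoff)      */

        printf("offset 0x%llx -> page 0x%llx, cluster %llu, "
               "offset-in-cluster 0x%llx, page-in-cluster %llu\n",
               offset, page, cluster, newoff, pg_in_cl);
        /* prints: offset 0x23000 -> page 0x23, cluster 8,
         *         offset-in-cluster 0x3000, page-in-cluster 3 */
        return 0;
}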
*/ - if (off != (vm_offset_t) 0) + if (off != (dp_offset_t) 0) ps_deallocate_cluster(clmap->cl_ps, off); VS_MAP_UNLOCK(vs); - return (vm_offset_t) 0; + return (dp_offset_t) 0; } else VS_MAP_UNLOCK(vs); - DEBUG(DEBUG_VS_INTERNAL, - ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n", - newcl+newoff, (int) vs, (int) vsmap, flag)); - DEBUG(DEBUG_VS_INTERNAL, - (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n", - (int) clmap->cl_ps, clmap->cl_numpages, - (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map)); + DP_DEBUG(DEBUG_VS_INTERNAL, + ("returning 0x%X,vs=0x%X,vsmap=0x%X,flag=%d\n", + newcl+newoff, (int) vs, (int) vsmap, flag)); + DP_DEBUG(DEBUG_VS_INTERNAL, + (" clmap->cl_ps=0x%X,cl_numpages=%d,clbmap=0x%x,cl_alloc=%x\n", + (int) clmap->cl_ps, clmap->cl_numpages, + (int) clmap->cl_bmap.clb_map, (int) clmap->cl_alloc.clb_map)); return (newcl + newoff); } -void ps_clunmap(vstruct_t, vm_offset_t, vm_size_t); /* forward */ +void ps_clunmap(vstruct_t, dp_offset_t, dp_size_t); /* forward */ void ps_clunmap( vstruct_t vs, - vm_offset_t offset, - vm_size_t length) + dp_offset_t offset, + dp_size_t length) { - vm_offset_t cluster; /* The cluster number of offset */ + dp_offset_t cluster; /* The cluster number of offset */ struct vs_map *vsmap; + struct ps_vnode_trim_data trim_data; + + ps_vnode_trim_init(&trim_data); VS_MAP_LOCK(vs); @@ -1980,20 +2359,22 @@ ps_clunmap( * clusters and map entries as encountered. */ while (length > 0) { - vm_offset_t newoff; - int i; + dp_offset_t newoff; + unsigned int i; - cluster = atop(offset) >> vs->vs_clshift; + cluster = atop_32(offset) >> vs->vs_clshift; if (vs->vs_indirect) /* indirect map */ vsmap = vs->vs_imap[cluster/CLMAP_ENTRIES]; else vsmap = vs->vs_dmap; if (vsmap == NULL) { + ps_vnode_trim_now(&trim_data); VS_MAP_UNLOCK(vs); return; } vsmap += cluster%CLMAP_ENTRIES; if (VSM_ISCLR(*vsmap)) { + ps_vnode_trim_now(&trim_data); length -= vm_page_size; offset += vm_page_size; continue; @@ -2003,12 +2384,12 @@ ps_clunmap( * paging segment cluster pages. * Optimize for entire cluster cleraing. */ - if (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) { + if ( (newoff = (offset&((1<<(vm_page_shift+vs->vs_clshift))-1))) ) { /* * Not cluster aligned. */ ASSERT(trunc_page(newoff) == newoff); - i = atop(newoff); + i = atop_32(newoff); } else i = 0; while ((i < VSCLSIZE(vs)) && (length > 0)) { @@ -2022,23 +2403,30 @@ ps_clunmap( /* * If map entry is empty, clear and deallocate cluster. 
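The ps_vnode_trim_more()/ps_vnode_trim_now() calls used in this cleanup path batch contiguous freed swap ranges so that, in the kernel, a single trim/unmap can be issued against the swap vnode instead of one per cluster. A minimal model of that coalescing is sketched here; the real routines take a struct vnode and cluster shifts, which are omitted, and the flush simply prints the range.

#include <stdio.h>

struct trim_data {
        unsigned long long offset;
        unsigned long long length;     /* 0 means nothing batched */
};

static void trim_init(struct trim_data *td) { td->offset = 0; td->length = 0; }

/* Flush whatever range has been accumulated so far. */
static void
trim_now(struct trim_data *td)
{
        if (td->length)
                printf("trim [0x%llx, +0x%llx)\n", td->offset, td->length);
        td->length = 0;
}

/* Extend the current run if contiguous, otherwise flush and start over. */
static void
trim_more(struct trim_data *td, unsigned long long offset, unsigned long long length)
{
        if (td->length && td->offset + td->length == offset) {
                td->length += length;
        } else {
                trim_now(td);
                td->offset = offset;
                td->length = length;
        }
}

int
main(void)
{
        struct trim_data td;
        trim_init(&td);
        trim_more(&td, 0x0000, 0x4000);
        trim_more(&td, 0x4000, 0x4000);   /* contiguous: coalesced          */
        trim_more(&td, 0xc000, 0x4000);   /* gap: first run is flushed here */
        trim_now(&td);                    /* flush whatever is left         */
        return 0;
}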
*/ - if (!VSM_ALLOC(*vsmap)) { + if (!VSM_BMAP(*vsmap)) { + ps_vnode_trim_more(&trim_data, + vsmap, + vs->vs_clshift, + VSCLSIZE(vs) * vm_page_size); ps_deallocate_cluster(VSM_PS(*vsmap), VSM_CLOFF(*vsmap)); VSM_CLR(*vsmap); + } else { + ps_vnode_trim_now(&trim_data); } } + ps_vnode_trim_now(&trim_data); VS_MAP_UNLOCK(vs); } -void ps_vs_write_complete(vstruct_t, vm_offset_t, vm_size_t, int); /* forward */ +void ps_vs_write_complete(vstruct_t, dp_offset_t, dp_size_t, int); /* forward */ void ps_vs_write_complete( vstruct_t vs, - vm_offset_t offset, - vm_size_t size, + dp_offset_t offset, + dp_size_t size, int error) { struct clmap clmap; @@ -2056,19 +2444,19 @@ ps_vs_write_complete( (void) ps_clmap(vs, offset, &clmap, CL_FIND, size, error); } -void vs_cl_write_complete(vstruct_t, paging_segment_t, vm_offset_t, vm_offset_t, vm_size_t, boolean_t, int); /* forward */ +void vs_cl_write_complete(vstruct_t, paging_segment_t, dp_offset_t, vm_offset_t, dp_size_t, boolean_t, int); /* forward */ void vs_cl_write_complete( - vstruct_t vs, - paging_segment_t ps, - vm_offset_t offset, - vm_offset_t addr, - vm_size_t size, - boolean_t async, - int error) + vstruct_t vs, + __unused paging_segment_t ps, + dp_offset_t offset, + __unused vm_offset_t addr, + dp_size_t size, + boolean_t async, + int error) { - kern_return_t kr; +// kern_return_t kr; if (error) { /* @@ -2079,7 +2467,7 @@ vs_cl_write_complete( dprintf(("write failed error = 0x%x\n", error)); /* add upl_abort code here */ } else - GSTAT(global_stats.gs_pages_out += atop(size)); + GSTAT(global_stats.gs_pages_out += atop_32(size)); /* * Notify the vstruct mapping code, so it can do its accounting. */ @@ -2092,7 +2480,6 @@ vs_cl_write_complete( if (vs->vs_async_pending == 0 && vs->vs_waiting_async) { vs->vs_waiting_async = FALSE; VS_UNLOCK(vs); - /* mutex_unlock(&vs->vs_waiting_async); */ thread_wakeup(&vs->vs_async_pending); } else { VS_UNLOCK(vs); @@ -2112,7 +2499,7 @@ device_write_reply( struct vs_async *vsa; vsa = (struct vs_async *) - ((struct vstruct_alias *)(reply_port->alias))->vs; + ((struct vstruct_alias *)(reply_port->ip_alias))->vs; if (device_code == KERN_SUCCESS && bytes_written != vsa->vsa_size) { device_code = KERN_FAILURE; @@ -2127,11 +2514,7 @@ device_write_reply( if(vsa->vsa_error) { /* need to consider error condition. re-write data or */ /* throw it away here. 
*/ - vm_offset_t ioaddr; - if(vm_map_copyout(kernel_map, &ioaddr, - (vm_map_copy_t)vsa->vsa_addr) != KERN_SUCCESS) - panic("vs_cluster_write: unable to copy source list\n"); - vm_deallocate(kernel_map, ioaddr, vsa->vsa_size); + vm_map_copy_discard((vm_map_copy_t)vsa->vsa_addr); } ps_vs_write_complete(vsa->vsa_vs, vsa->vsa_offset, vsa->vsa_size, vsa->vsa_error); @@ -2166,11 +2549,11 @@ device_read_reply( { struct vs_async *vsa; vsa = (struct vs_async *) - ((struct vstruct_alias *)(reply_port->alias))->vs; + ((struct vstruct_alias *)(reply_port->defpager_importance.alias))->vs; vsa->vsa_addr = (vm_offset_t)data; vsa->vsa_size = (vm_size_t)dataCnt; vsa->vsa_error = return_code; - thread_wakeup(&vsa->vsa_lock); + thread_wakeup(&vsa); return KERN_SUCCESS; } @@ -2208,12 +2591,10 @@ device_open_reply( return KERN_SUCCESS; } -kern_return_t ps_read_device(paging_segment_t, vm_offset_t, vm_offset_t *, unsigned int, unsigned int *, int); /* forward */ - kern_return_t ps_read_device( paging_segment_t ps, - vm_offset_t offset, + dp_offset_t offset, vm_offset_t *bufferp, unsigned int size, unsigned int *residualp, @@ -2228,14 +2609,13 @@ ps_read_device( vm_offset_t buf_ptr; unsigned int records_read; struct vs_async *vsa; - mutex_t vs_waiting_read_reply; device_t device; vm_map_copy_t device_data = NULL; default_pager_thread_t *dpt = NULL; device = dev_port_lookup(ps->ps_device); - clustered_reads[atop(size)]++; + clustered_reads[atop_32(size)]++; dev_offset = (ps->ps_offset + (offset >> (vm_page_shift - ps->ps_record_shift))); @@ -2252,7 +2632,6 @@ ps_read_device( vsa->vsa_size = 0; vsa->vsa_ps = NULL; } - mutex_init(&vsa->vsa_lock, ETAP_DPAGE_VSSEQNO); ip_lock(vsa->reply_port); vsa->reply_port->ip_sorights++; ip_reference(vsa->reply_port); @@ -2268,7 +2647,7 @@ ps_read_device( (io_buf_ptr_t *) &dev_buffer, (mach_msg_type_number_t *) &bytes_read); if(kr == MIG_NO_REPLY) { - assert_wait(&vsa->vsa_lock, THREAD_UNINT); + assert_wait(&vsa, THREAD_UNINT); thread_block(THREAD_CONTINUE_NULL); dev_buffer = vsa->vsa_addr; @@ -2308,9 +2687,9 @@ ps_read_device( records_read = (bytes_read >> (vm_page_shift - ps->ps_record_shift)); dev_offset += records_read; - DEBUG(DEBUG_VS_INTERNAL, - ("calling vm_deallocate(addr=0x%X,size=0x%X)\n", - dev_buffer, bytes_read)); + DP_DEBUG(DEBUG_VS_INTERNAL, + ("calling vm_deallocate(addr=0x%X,size=0x%X)\n", + dev_buffer, bytes_read)); if (vm_deallocate(kernel_map, dev_buffer, bytes_read) != KERN_SUCCESS) Panic("dealloc buf"); @@ -2319,7 +2698,7 @@ ps_read_device( *residualp = size - total_read; if((dev_buffer != *bufferp) && (total_read != 0)) { vm_offset_t temp_buffer; - vm_allocate(kernel_map, &temp_buffer, total_read, TRUE); + vm_allocate(kernel_map, &temp_buffer, total_read, VM_FLAGS_ANYWHERE); memcpy((void *) temp_buffer, (void *) *bufferp, total_read); if(vm_map_copyin_page_list(kernel_map, temp_buffer, total_read, VM_MAP_COPYIN_OPT_SRC_DESTROY | @@ -2349,12 +2728,10 @@ ps_read_device( return KERN_SUCCESS; } -kern_return_t ps_write_device(paging_segment_t, vm_offset_t, vm_offset_t, unsigned int, struct vs_async *); /* forward */ - kern_return_t ps_write_device( paging_segment_t ps, - vm_offset_t offset, + dp_offset_t offset, vm_offset_t addr, unsigned int size, struct vs_async *vsa) @@ -2367,7 +2744,7 @@ ps_write_device( - clustered_writes[atop(size)]++; + clustered_writes[atop_32(size)]++; dev_offset = (ps->ps_offset + (offset >> (vm_page_shift - ps->ps_record_shift))); @@ -2404,7 +2781,7 @@ ps_write_device( "device_write_request returned ", kr, addr, size, offset)); 
BS_STAT(ps->ps_bs, - ps->ps_bs->bs_pages_out_fail += atop(size)); + ps->ps_bs->bs_pages_out_fail += atop_32(size)); /* do the completion notification to free resources */ device_write_reply(reply_port, kr, 0); return PAGER_ERROR; @@ -2430,7 +2807,7 @@ ps_write_device( "device_write returned ", kr, addr, size, offset)); BS_STAT(ps->ps_bs, - ps->ps_bs->bs_pages_out_fail += atop(size)); + ps->ps_bs->bs_pages_out_fail += atop_32(size)); return PAGER_ERROR; } if (bytes_written & ((vm_page_size >> ps->ps_record_shift) - 1)) @@ -2456,116 +2833,287 @@ ps_write_device( kern_return_t ps_read_device( - paging_segment_t ps, - vm_offset_t offset, - vm_offset_t *bufferp, - unsigned int size, - unsigned int *residualp, - int flags) + __unused paging_segment_t ps, + __unused dp_offset_t offset, + __unused vm_offset_t *bufferp, + __unused unsigned int size, + __unused unsigned int *residualp, + __unused int flags) { panic("ps_read_device not supported"); + return KERN_FAILURE; } +kern_return_t ps_write_device( - paging_segment_t ps, - vm_offset_t offset, - vm_offset_t addr, - unsigned int size, - struct vs_async *vsa) + __unused paging_segment_t ps, + __unused dp_offset_t offset, + __unused vm_offset_t addr, + __unused unsigned int size, + __unused struct vs_async *vsa) { panic("ps_write_device not supported"); + return KERN_FAILURE; } #endif /* DEVICE_PAGING */ -void pvs_object_data_provided(vstruct_t, upl_t, vm_offset_t, vm_size_t); /* forward */ +void pvs_object_data_provided(vstruct_t, upl_t, upl_offset_t, upl_size_t); /* forward */ void pvs_object_data_provided( - vstruct_t vs, - upl_t upl, - vm_offset_t offset, - vm_size_t size) + __unused vstruct_t vs, + __unused upl_t upl, + __unused upl_offset_t offset, + upl_size_t size) { +#if RECLAIM_SWAP + boolean_t empty; +#endif - DEBUG(DEBUG_VS_INTERNAL, - ("buffer=0x%x,offset=0x%x,size=0x%x\n", - upl, offset, size)); + DP_DEBUG(DEBUG_VS_INTERNAL, + ("buffer=0x%x,offset=0x%x,size=0x%x\n", + upl, offset, size)); ASSERT(size > 0); - GSTAT(global_stats.gs_pages_in += atop(size)); - - -#if USE_PRECIOUS - ps_clunmap(vs, offset, size); -#endif /* USE_PRECIOUS */ + GSTAT(global_stats.gs_pages_in += atop_32(size)); + +/* check upl iosync flag instead of using RECLAIM_SWAP*/ +#if RECLAIM_SWAP + if (size != upl->size) { + if (size) { + ps_clunmap(vs, offset, size); + upl_commit_range(upl, 0, size, 0, NULL, 0, &empty); + } + upl_abort(upl, UPL_ABORT_ERROR); + upl_deallocate(upl); + } else { + ps_clunmap(vs, offset, size); + upl_commit(upl, NULL, 0); + upl_deallocate(upl); + } +#endif /* RECLAIM_SWAP */ } +static memory_object_offset_t last_start; +static vm_size_t last_length; + +/* + * A "cnt" of 0 means that the caller just wants to check if the page at + * offset "vs_offset" exists in the backing store. That page hasn't been + * prepared, so no need to release it. + * + * A "cnt" of -1 means that the caller wants to bring back from the backing + * store all existing pages in the cluster containing "vs_offset". 
+ */ kern_return_t pvs_cluster_read( vstruct_t vs, - vm_offset_t vs_offset, - vm_size_t cnt) + dp_offset_t vs_offset, + dp_size_t cnt, + void *fault_info) { - upl_t upl; kern_return_t error = KERN_SUCCESS; - int size; + unsigned int size; unsigned int residual; unsigned int request_flags; - int seg_index; - int pages_in_cl; + int io_flags = 0; + int seg_index; + int pages_in_cl; int cl_size; int cl_mask; - int cl_index; - int xfer_size; - vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT]; - paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT]; + int cl_index; + unsigned int xfer_size; + dp_offset_t orig_vs_offset; + dp_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT]; + paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT]; struct clmap clmap; + upl_t upl; + unsigned int page_list_count; + memory_object_offset_t cluster_start; + vm_size_t cluster_length; + uint32_t io_streaming; + int i; + boolean_t io_sync = FALSE; + boolean_t reclaim_all = FALSE; pages_in_cl = 1 << vs->vs_clshift; cl_size = pages_in_cl * vm_page_size; cl_mask = cl_size - 1; + request_flags = UPL_NO_SYNC | UPL_RET_ONLY_ABSENT | UPL_SET_LITE; + + if (cnt == (dp_size_t) -1) + reclaim_all = TRUE; + + if (reclaim_all == TRUE) { + /* + * We've been called from ps_vstruct_reclaim() to move all + * the object's swapped pages back to VM pages. + * This can put memory pressure on the system, so we do want + * to wait for free pages, to avoid getting in the way of the + * vm_pageout_scan() thread. + * Let's not use UPL_NOBLOCK in this case. + */ + vs_offset &= ~cl_mask; + i = pages_in_cl; + } else { + i = 1; + + /* + * if the I/O cluster size == PAGE_SIZE, we don't want to set + * the UPL_NOBLOCK since we may be trying to recover from a + * previous partial pagein I/O that occurred because we were low + * on memory and bailed early in order to honor the UPL_NOBLOCK... + * since we're only asking for a single page, we can block w/o fear + * of tying up pages while waiting for more to become available + */ + if (fault_info == NULL || ((vm_object_fault_info_t)fault_info)->cluster_size > PAGE_SIZE) + request_flags |= UPL_NOBLOCK; + } + +again: + cl_index = (vs_offset & cl_mask) / vm_page_size; + + if ((ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0) == (dp_offset_t)-1) || + !CLMAP_ISSET(clmap, cl_index)) { + /* + * the needed page doesn't exist in the backing store... + * we don't want to try to do any I/O, just abort the + * page and let the fault handler provide a zero-fill + */ + if (cnt == 0) { + /* + * The caller was just poking at us to see if + * the page has been paged out. No need to + * mess with the page at all. + * Just let the caller know we don't have that page. + */ + return KERN_FAILURE; + } + if (reclaim_all == TRUE) { + i--; + if (i == 0) { + /* no more pages in this cluster */ + return KERN_FAILURE; + } + /* try the next page in this cluster */ + vs_offset += vm_page_size; + goto again; + } + + page_list_count = 0; + + memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset, + PAGE_SIZE, PAGE_SIZE, + &upl, NULL, &page_list_count, + request_flags | UPL_SET_INTERNAL); + upl_range_needed(upl, 0, 1); + + if (clmap.cl_error) + upl_abort(upl, UPL_ABORT_ERROR); + else + upl_abort(upl, UPL_ABORT_UNAVAILABLE); + upl_deallocate(upl); + + return KERN_SUCCESS; + } + + if (cnt == 0) { + /* + * The caller was just poking at us to see if + * the page has been paged out. 
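The test above ("does the page at vs_offset exist in the backing store?") combines a ps_clmap() lookup of the cluster containing the offset with a CLMAP_ISSET() check of that page's bit. A small stand-alone sketch of the same test, with a plain integer standing in for the cluster bitmap and assuming 4 KB pages and a cluster shift of 2:

#include <stdbool.h>
#include <stdio.h>

#define VM_PAGE_SHIFT 12
#define CLSHIFT       2
#define CL_SIZE       (1u << (VM_PAGE_SHIFT + CLSHIFT))
#define CL_MASK       (CL_SIZE - 1)

/* cluster_valid models ps_clmap() not returning (dp_offset_t)-1;
 * cluster_bitmap models the per-cluster page bitmap. */
static bool
page_present(unsigned cluster_bitmap, int cluster_valid, unsigned long long vs_offset)
{
        if (!cluster_valid)
                return false;
        unsigned cl_index = (unsigned)((vs_offset & CL_MASK) >> VM_PAGE_SHIFT);
        return (cluster_bitmap >> cl_index) & 1u;       /* CLMAP_ISSET() analogue */
}

int
main(void)
{
        /* cluster holds pages 0 and 3 (bits 0 and 3) of a 4-page cluster */
        unsigned bitmap = 0x9;
        printf("%d %d\n",
               page_present(bitmap, 1, 0x10000 + 0x3000),   /* page 3 present -> 1 */
               page_present(bitmap, 1, 0x10000 + 0x1000));  /* page 1 absent  -> 0 */
        return 0;
}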
No need to + * mess with the page at all. + * Just let the caller know we do have that page. + */ + return KERN_SUCCESS; + } + + if(((vm_object_fault_info_t)fault_info)->io_sync == TRUE ) { + io_sync = TRUE; + } else { +#if RECLAIM_SWAP + io_sync = TRUE; +#endif /* RECLAIM_SWAP */ + } + + if( io_sync == TRUE ) { + + io_flags |= UPL_IOSYNC | UPL_NOCOMMIT; +#if USE_PRECIOUS + request_flags |= UPL_PRECIOUS | UPL_CLEAN_IN_PLACE; +#else /* USE_PRECIOUS */ + request_flags |= UPL_REQUEST_SET_DIRTY; +#endif /* USE_PRECIOUS */ + } + + assert(dp_encryption_inited); + if (dp_encryption) { + /* + * ENCRYPTED SWAP: + * request that the UPL be prepared for + * decryption. + */ + request_flags |= UPL_ENCRYPT; + io_flags |= UPL_PAGING_ENCRYPTED; + } + orig_vs_offset = vs_offset; + + assert(cnt != 0); + cnt = VM_SUPER_CLUSTER; + cluster_start = (memory_object_offset_t) vs_offset; + cluster_length = (vm_size_t) cnt; + io_streaming = 0; + + /* + * determine how big a speculative I/O we should try for... + */ + if (memory_object_cluster_size(vs->vs_control, &cluster_start, &cluster_length, &io_streaming, (memory_object_fault_info_t)fault_info) == KERN_SUCCESS) { + assert(vs_offset >= (dp_offset_t) cluster_start && + vs_offset < (dp_offset_t) (cluster_start + cluster_length)); + vs_offset = (dp_offset_t) cluster_start; + cnt = (dp_size_t) cluster_length; + } else { + cluster_length = PAGE_SIZE; + cnt = PAGE_SIZE; + } + + if (io_streaming) + io_flags |= UPL_IOSTREAMING; + + last_start = cluster_start; + last_length = cluster_length; + /* * This loop will be executed multiple times until the entire - * request has been satisfied... if the request spans cluster + * range has been looked at or we issue an I/O... if the request spans cluster * boundaries, the clusters will be checked for logical continunity, - * if contiguous the I/O request will span multiple clusters, otherwise - * it will be broken up into the minimal set of I/O's - * - * If there are holes in a request (either unallocated pages in a paging - * segment or an unallocated paging segment), we stop - * reading at the hole, inform the VM of any data read, inform - * the VM of an unavailable range, then loop again, hoping to - * find valid pages later in the requested range. This continues until - * the entire range has been examined, and read, if present. + * if contiguous the I/O request will span multiple clusters... + * at most only 1 I/O will be issued... 
it will encompass the original offset */ - -#if USE_PRECIOUS - request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_RET_ONLY_ABSENT; -#else - request_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_RET_ONLY_ABSENT; -#endif - while (cnt && (error == KERN_SUCCESS)) { + while (cnt && error == KERN_SUCCESS) { int ps_info_valid; - int page_list_count; - if (cnt > VM_SUPER_CLUSTER) + if ((vs_offset & cl_mask) && (cnt > (VM_SUPER_CLUSTER - (vs_offset & cl_mask)))) { + size = VM_SUPER_CLUSTER; + size -= vs_offset & cl_mask; + } else if (cnt > VM_SUPER_CLUSTER) size = VM_SUPER_CLUSTER; else size = cnt; + cnt -= size; ps_info_valid = 0; seg_index = 0; while (size > 0 && error == KERN_SUCCESS) { - int abort_size; + unsigned int abort_size; + unsigned int lsize; int failed_size; int beg_pseg; int beg_indx; - vm_offset_t cur_offset; - + dp_offset_t cur_offset; if ( !ps_info_valid) { ps_offset[seg_index] = ps_clmap(vs, vs_offset & ~cl_mask, &clmap, CL_FIND, 0, 0); @@ -2575,30 +3123,16 @@ pvs_cluster_read( /* * skip over unallocated physical segments */ - if (ps_offset[seg_index] == (vm_offset_t) -1) { + if (ps_offset[seg_index] == (dp_offset_t) -1) { abort_size = cl_size - (vs_offset & cl_mask); abort_size = MIN(abort_size, size); - page_list_count = 0; - memory_object_super_upl_request( - vs->vs_control, - (memory_object_offset_t)vs_offset, - abort_size, abort_size, - &upl, NULL, &page_list_count, - request_flags); - - if (clmap.cl_error) { - upl_abort(upl, UPL_ABORT_ERROR); - } else { - upl_abort(upl, UPL_ABORT_UNAVAILABLE); - } - upl_deallocate(upl); - - size -= abort_size; - vs_offset += abort_size; + size -= abort_size; + vs_offset += abort_size; seg_index++; ps_info_valid = 0; + continue; } cl_index = (vs_offset & cl_mask) / vm_page_size; @@ -2612,24 +3146,8 @@ pvs_cluster_read( abort_size += vm_page_size; } if (abort_size) { - /* - * Let VM system know about holes in clusters. - */ - GSTAT(global_stats.gs_pages_unavail += atop(abort_size)); - - page_list_count = 0; - memory_object_super_upl_request( - vs->vs_control, - (memory_object_offset_t)vs_offset, - abort_size, abort_size, - &upl, NULL, &page_list_count, - request_flags); - - upl_abort(upl, UPL_ABORT_UNAVAILABLE); - upl_deallocate(upl); - - size -= abort_size; - vs_offset += abort_size; + size -= abort_size; + vs_offset += abort_size; if (cl_index == pages_in_cl) { /* @@ -2638,6 +3156,7 @@ pvs_cluster_read( */ seg_index++; ps_info_valid = 0; + continue; } if (size == 0) @@ -2660,7 +3179,8 @@ pvs_cluster_read( while (cl_index < pages_in_cl && xfer_size < size) { /* - * accumulate allocated pages within a physical segment + * accumulate allocated pages within + * a physical segment */ if (CLMAP_ISSET(clmap, cl_index)) { xfer_size += vm_page_size; @@ -2674,74 +3194,99 @@ pvs_cluster_read( } if (cl_index < pages_in_cl || xfer_size >= size) { /* - * we've hit an unallocated page or the - * end of this request... go fire the I/O + * we've hit an unallocated page or + * the end of this request... 
see if + * it's time to fire the I/O */ break; } /* - * we've hit the end of the current physical segment - * and there's more to do, so try moving to the next one + * we've hit the end of the current physical + * segment and there's more to do, so try + * moving to the next one */ seg_index++; ps_offset[seg_index] = ps_clmap(vs, cur_offset & ~cl_mask, &clmap, CL_FIND, 0, 0); - psp[seg_index] = CLMAP_PS(clmap); + psp[seg_index] = CLMAP_PS(clmap); ps_info_valid = 1; if ((ps_offset[seg_index - 1] != (ps_offset[seg_index] - cl_size)) || (psp[seg_index - 1] != psp[seg_index])) { /* - * if the physical segment we're about to step into - * is not contiguous to the one we're currently - * in, or it's in a different paging file, or + * if the physical segment we're about + * to step into is not contiguous to + * the one we're currently in, or it's + * in a different paging file, or * it hasn't been allocated.... - * we stop here and generate the I/O + * we stop this run and go check + * to see if it's time to fire the I/O */ break; } /* - * start with first page of the next physical segment + * start with first page of the next physical + * segment */ cl_index = 0; } - if (xfer_size) { + if (xfer_size == 0) { /* - * we have a contiguous range of allocated pages - * to read from + * no I/O to generate for this segment */ - page_list_count = 0; - memory_object_super_upl_request(vs->vs_control, - (memory_object_offset_t)vs_offset, - xfer_size, xfer_size, - &upl, NULL, &page_list_count, - request_flags | UPL_SET_INTERNAL); - - error = ps_read_file(psp[beg_pseg], upl, (vm_offset_t) 0, - ps_offset[beg_pseg] + (beg_indx * vm_page_size), xfer_size, &residual, 0); - } else continue; + } + if (cur_offset <= orig_vs_offset) { + /* + * we've hit a hole in our speculative cluster + * before the offset that we're really after... + * don't issue the I/O since it doesn't encompass + * the original offset and we're looking to only + * pull in the speculative pages if they can be + * made part of a single I/O + */ + size -= xfer_size; + vs_offset += xfer_size; + + continue; + } + /* + * we have a contiguous range of allocated pages + * to read from that encompasses the original offset + */ + page_list_count = 0; + memory_object_super_upl_request(vs->vs_control, (memory_object_offset_t)vs_offset, + xfer_size, xfer_size, + &upl, NULL, &page_list_count, + request_flags | UPL_SET_INTERNAL); + + error = ps_read_file(psp[beg_pseg], + upl, (upl_offset_t) 0, + ps_offset[beg_pseg] + (beg_indx * vm_page_size), + xfer_size, &residual, io_flags); - failed_size = 0; /* - * Adjust counts and send response to VM. Optimize for the - * common case, i.e. no error and/or partial data. - * If there was an error, then we need to error the entire - * range, even if some data was successfully read. - * If there was a partial read we may supply some + * Adjust counts and send response to VM. Optimize + * for the common case, i.e. no error and/or partial + * data. If there was an error, then we need to error + * the entire range, even if some data was successfully + * read. If there was a partial read we may supply some * data and may error some as well. In all cases the - * VM must receive some notification for every page in the - * range. + * VM must receive some notification for every page + * in the range. */ if ((error == KERN_SUCCESS) && (residual == 0)) { /* - * Got everything we asked for, supply the data to - * the VM. 
Note that as a side effect of supplying - * the data, the buffer holding the supplied data is - * deallocated from the pager's address space. + * Got everything we asked for, supply the data + * to the VM. Note that as a side effect of + * supplying the data, the buffer holding the + * supplied data is deallocated from the pager's + * address space. */ - pvs_object_data_provided(vs, upl, vs_offset, xfer_size); + lsize = xfer_size; + failed_size = 0; } else { + lsize = 0; failed_size = xfer_size; if (error == KERN_SUCCESS) { @@ -2751,8 +3296,7 @@ pvs_cluster_read( * and no data moved, we turn it into * an error, assuming we're reading at * or beyong EOF. - * Fall through and error the entire - * range. + * Fall through and error the entire range. */ error = KERN_FAILURE; } else { @@ -2765,33 +3309,40 @@ pvs_cluster_read( * Fall through and error the remainder * of the range, if any. */ - int fill, lsize; + int fill; - fill = residual & ~vm_page_size; + fill = residual & (vm_page_size - 1); lsize = (xfer_size - residual) + fill; - pvs_object_data_provided(vs, upl, vs_offset, lsize); - if (lsize < xfer_size) { + if (lsize < xfer_size) failed_size = xfer_size - lsize; + + if (reclaim_all == FALSE) error = KERN_FAILURE; - } } } } - /* - * If there was an error in any part of the range, tell - * the VM. Note that error is explicitly checked again since - * it can be modified above. - */ - if (error != KERN_SUCCESS) { + pvs_object_data_provided(vs, upl, vs_offset, lsize); + + if (failed_size) { + /* + * There was an error in some part of the range, tell + * the VM. Note that error is explicitly checked again + * since it can be modified above. + */ BS_STAT(psp[beg_pseg]->ps_bs, - psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop(failed_size)); + psp[beg_pseg]->ps_bs->bs_pages_in_fail += atop_32(failed_size)); } - size -= xfer_size; - vs_offset += xfer_size; + /* + * we've issued a single I/O that encompassed the original offset + * at this point we either met our speculative request length or + * we ran into a 'hole' (i.e. page not present in the cluster, cluster + * not present or not physically contiguous to the previous one), so + * we're done issuing I/O at this point + */ + return (error); } - - } /* END while (cnt && (error == 0)) */ + } return error; } @@ -2801,63 +3352,92 @@ kern_return_t vs_cluster_write( vstruct_t vs, upl_t internal_upl, - vm_offset_t offset, - vm_size_t cnt, + upl_offset_t offset, + upl_size_t cnt, boolean_t dp_internal, int flags) { - vm_offset_t size; - vm_offset_t transfer_size; + upl_size_t transfer_size; int error = 0; struct clmap clmap; - vm_offset_t actual_offset; /* Offset within paging segment */ + dp_offset_t actual_offset; /* Offset within paging segment */ paging_segment_t ps; - vm_offset_t subx_size; - vm_offset_t mobj_base_addr; - vm_offset_t mobj_target_addr; - int mobj_size; - - struct vs_async *vsa; - vm_map_copy_t copy; + dp_offset_t mobj_base_addr; + dp_offset_t mobj_target_addr; upl_t upl; upl_page_info_t *pl; int page_index; + unsigned int page_max_index; int list_size; - int cl_size; + int pages_in_cl; + unsigned int cl_size; + int base_index; + unsigned int seg_size; + unsigned int upl_offset_in_object; + boolean_t minimal_clustering = FALSE; + boolean_t found_dirty; + + if (!dp_encryption_inited) { + /* + * ENCRYPTED SWAP: + * Once we've started using swap, we + * can't change our mind on whether + * it needs to be encrypted or + * not. 
+ */ + dp_encryption_inited = TRUE; + } + if (dp_encryption) { + /* + * ENCRYPTED SWAP: + * the UPL will need to be encrypted... + */ + flags |= UPL_PAGING_ENCRYPTED; + } + + pages_in_cl = 1 << vs->vs_clshift; + cl_size = pages_in_cl * vm_page_size; +#if CONFIG_FREEZE + minimal_clustering = TRUE; +#else + if (dp_isssd == TRUE) + minimal_clustering = TRUE; +#endif if (!dp_internal) { - int page_list_count; + unsigned int page_list_count; int request_flags; - int super_size; + unsigned int super_size; int first_dirty; int num_dirty; int num_of_pages; int seg_index; - int pages_in_cl; - int must_abort; - vm_offset_t upl_offset; - vm_offset_t seg_offset; - vm_offset_t ps_offset[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT]; - paging_segment_t psp[(VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_DEF_CLSHIFT]; - + upl_offset_t upl_offset; + upl_offset_t upl_offset_aligned; + dp_offset_t seg_offset; + dp_offset_t ps_offset[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1]; + paging_segment_t psp[((VM_SUPER_CLUSTER / PAGE_SIZE) >> VSTRUCT_MIN_CLSHIFT) + 1]; - pages_in_cl = 1 << vs->vs_clshift; - cl_size = pages_in_cl * vm_page_size; - if (bs_low) { + if (bs_low) super_size = cl_size; - - request_flags = UPL_NOBLOCK | - UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM | - UPL_NO_SYNC | UPL_SET_INTERNAL; - } else { + else super_size = VM_SUPER_CLUSTER; - request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE | - UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM | - UPL_NO_SYNC | UPL_SET_INTERNAL; + request_flags = UPL_NOBLOCK | UPL_CLEAN_IN_PLACE | + UPL_RET_ONLY_DIRTY | UPL_COPYOUT_FROM | + UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE; + + if (dp_encryption) { + /* + * ENCRYPTED SWAP: + * request that the UPL be prepared for + * encryption. + */ + request_flags |= UPL_ENCRYPT; + flags |= UPL_PAGING_ENCRYPTED; } page_list_count = 0; @@ -2865,47 +3445,113 @@ vs_cluster_write( (memory_object_offset_t)offset, cnt, super_size, &upl, NULL, &page_list_count, - request_flags | UPL_PAGEOUT); + request_flags | UPL_FOR_PAGEOUT); + + /* + * The default pager does not handle objects larger than + * 4GB, so it does not deal with offset that don't fit in + * 32-bit. Cast down upl->offset now and make sure we + * did not lose any valuable bits. + */ + upl_offset_in_object = (unsigned int) upl->offset; + assert(upl->offset == upl_offset_in_object); pl = UPL_GET_INTERNAL_PAGE_LIST(upl); + seg_size = cl_size - (upl_offset_in_object % cl_size); + upl_offset_aligned = upl_offset_in_object & ~(cl_size - 1); + page_index = 0; + page_max_index = upl->size / PAGE_SIZE; + found_dirty = TRUE; + for (seg_index = 0, transfer_size = upl->size; transfer_size > 0; ) { - ps_offset[seg_index] = ps_clmap(vs, upl->offset + (seg_index * cl_size), - &clmap, CL_ALLOC, - transfer_size < cl_size ? 
- transfer_size : cl_size, 0); + unsigned int seg_pgcnt; - if (ps_offset[seg_index] == (vm_offset_t) -1) { - upl_abort(upl, 0); - upl_deallocate(upl); - - return KERN_FAILURE; + seg_pgcnt = seg_size / PAGE_SIZE; - } - psp[seg_index] = CLMAP_PS(clmap); + if (minimal_clustering == TRUE) { + unsigned int non_dirty; + + non_dirty = 0; + found_dirty = FALSE; + + for (; non_dirty < seg_pgcnt; non_dirty++) { + if ((page_index + non_dirty) >= page_max_index) + break; - if (transfer_size > cl_size) { - transfer_size -= cl_size; + if (UPL_DIRTY_PAGE(pl, page_index + non_dirty) || + UPL_PRECIOUS_PAGE(pl, page_index + non_dirty)) { + found_dirty = TRUE; + break; + } + } + } + if (found_dirty == TRUE) { + ps_offset[seg_index] = + ps_clmap(vs, + upl_offset_aligned, + &clmap, CL_ALLOC, + cl_size, 0); + + if (ps_offset[seg_index] == (dp_offset_t) -1) { + upl_abort(upl, 0); + upl_deallocate(upl); + + return KERN_FAILURE; + } + psp[seg_index] = CLMAP_PS(clmap); + } + if (transfer_size > seg_size) { + page_index += seg_pgcnt; + transfer_size -= seg_size; + upl_offset_aligned += cl_size; + seg_size = cl_size; seg_index++; } else transfer_size = 0; } - for (page_index = 0, num_of_pages = upl->size / vm_page_size; page_index < num_of_pages; ) { + /* + * Ignore any non-present pages at the end of the + * UPL. + */ + for (page_index = upl->size / vm_page_size; page_index > 0;) { + if (UPL_PAGE_PRESENT(pl, --page_index)) { + page_index++; + break; + } + } + if (page_index == 0) { + /* + * no pages in the UPL + * abort and return + */ + upl_abort(upl, 0); + upl_deallocate(upl); + + return KERN_SUCCESS; + } + num_of_pages = page_index; + + base_index = (upl_offset_in_object % cl_size) / PAGE_SIZE; + + for (page_index = 0; page_index < num_of_pages; ) { /* * skip over non-dirty pages */ for ( ; page_index < num_of_pages; page_index++) { - if (UPL_DIRTY_PAGE(pl, page_index) || UPL_PRECIOUS_PAGE(pl, page_index)) + if (UPL_DIRTY_PAGE(pl, page_index) + || UPL_PRECIOUS_PAGE(pl, page_index)) /* * this is a page we need to write - * go see if we can buddy it up with others - * that are contiguous to it + * go see if we can buddy it up with + * others that are contiguous to it */ break; /* - * if the page is not-dirty, but present we need to commit it... - * this is an unusual case since we only asked for dirty pages + * if the page is not-dirty, but present we + * need to commit it... This is an unusual + * case since we only asked for dirty pages */ if (UPL_PAGE_PRESENT(pl, page_index)) { boolean_t empty = FALSE; @@ -2916,8 +3562,11 @@ vs_cluster_write( pl, page_list_count, &empty); - if (empty) + if (empty) { + assert(page_index == + num_of_pages - 1); upl_deallocate(upl); + } } } if (page_index == num_of_pages) @@ -2927,14 +3576,16 @@ vs_cluster_write( break; /* - * gather up contiguous dirty pages... we have at least 1 - * otherwise we would have bailed above + * gather up contiguous dirty pages... 
we have at + * least 1 * otherwise we would have bailed above * make sure that each physical segment that we step * into is contiguous to the one we're currently in * if it's not, we have to stop and write what we have */ - for (first_dirty = page_index; page_index < num_of_pages; ) { - if ( !UPL_DIRTY_PAGE(pl, page_index) && !UPL_PRECIOUS_PAGE(pl, page_index)) + for (first_dirty = page_index; + page_index < num_of_pages; ) { + if ( !UPL_DIRTY_PAGE(pl, page_index) + && !UPL_PRECIOUS_PAGE(pl, page_index)) break; page_index++; /* @@ -2946,62 +3597,63 @@ vs_cluster_write( int cur_seg; int nxt_seg; - cur_seg = (page_index - 1) / pages_in_cl; - nxt_seg = page_index / pages_in_cl; + cur_seg = (base_index + (page_index - 1))/pages_in_cl; + nxt_seg = (base_index + page_index)/pages_in_cl; if (cur_seg != nxt_seg) { if ((ps_offset[cur_seg] != (ps_offset[nxt_seg] - cl_size)) || (psp[cur_seg] != psp[nxt_seg])) - /* - * if the segment we're about to step into - * is not contiguous to the one we're currently - * in, or it's in a different paging file.... - * we stop here and generate the I/O - */ + /* + * if the segment we're about + * to step into is not + * contiguous to the one we're + * currently in, or it's in a + * different paging file.... + * we stop here and generate + * the I/O + */ break; } } } num_dirty = page_index - first_dirty; - must_abort = 1; if (num_dirty) { upl_offset = first_dirty * vm_page_size; - seg_index = first_dirty / pages_in_cl; - seg_offset = upl_offset - (seg_index * cl_size); transfer_size = num_dirty * vm_page_size; - error = ps_write_file(psp[seg_index], upl, upl_offset, - ps_offset[seg_index] + seg_offset, transfer_size, flags); - - if (error == 0) { - while (transfer_size) { - int seg_size; + while (transfer_size) { - if ((seg_size = cl_size - (upl_offset % cl_size)) > transfer_size) - seg_size = transfer_size; + if ((seg_size = cl_size - + ((upl_offset_in_object + + upl_offset) % cl_size)) + > transfer_size) + seg_size = transfer_size; - ps_vs_write_complete(vs, upl->offset + upl_offset, seg_size, error); + ps_vs_write_complete( + vs, + (upl_offset_in_object + + upl_offset), + seg_size, error); - transfer_size -= seg_size; - upl_offset += seg_size; - } + transfer_size -= seg_size; + upl_offset += seg_size; } - must_abort = 0; - } - if (must_abort) { - boolean_t empty = FALSE; - upl_abort_range(upl, - first_dirty * vm_page_size, - num_dirty * vm_page_size, - UPL_ABORT_NOTIFY_EMPTY, - &empty); - if (empty) - upl_deallocate(upl); + upl_offset = first_dirty * vm_page_size; + transfer_size = num_dirty * vm_page_size; + + seg_index = (base_index + first_dirty) / pages_in_cl; + seg_offset = (upl_offset_in_object + upl_offset) % cl_size; + + error = ps_write_file(psp[seg_index], + upl, upl_offset, + ps_offset[seg_index] + + seg_offset, + transfer_size, flags); } } } else { - assert(cnt <= (vm_page_size << vs->vs_clshift)); + assert(cnt <= (unsigned) (vm_page_size << vs->vs_clshift)); list_size = cnt; page_index = 0; @@ -3018,23 +3670,23 @@ vs_cluster_write( &clmap, CL_ALLOC, transfer_size < cl_size ? 
transfer_size : cl_size, 0); - if(actual_offset == (vm_offset_t) -1) { + if(actual_offset == (dp_offset_t) -1) { error = 1; break; } cnt = MIN(transfer_size, - CLMAP_NPGS(clmap) * vm_page_size); + (unsigned) CLMAP_NPGS(clmap) * vm_page_size); ps = CLMAP_PS(clmap); /* Assume that the caller has given us contiguous */ /* pages */ if(cnt) { + ps_vs_write_complete(vs, mobj_target_addr, + cnt, error); error = ps_write_file(ps, internal_upl, 0, actual_offset, cnt, flags); if (error) break; - ps_vs_write_complete(vs, mobj_target_addr, - cnt, error); } if (error) break; @@ -3059,7 +3711,7 @@ ps_vstruct_allocated_size( { int num_pages; struct vs_map *vsmap; - int i, j, k; + unsigned int i, j, k; num_pages = 0; if (vs->vs_indirect) { @@ -3097,19 +3749,19 @@ ps_vstruct_allocated_size( } } - return ptoa(num_pages); + return ptoa_32(num_pages); } -size_t +unsigned int ps_vstruct_allocated_pages( vstruct_t vs, default_pager_page_t *pages, - size_t pages_size) + unsigned int pages_size) { - int num_pages; + unsigned int num_pages; struct vs_map *vsmap; - vm_offset_t offset; - int i, j, k; + dp_offset_t offset; + unsigned int i, j, k; num_pages = 0; offset = 0; @@ -3175,9 +3827,9 @@ ps_vstruct_transfer_from_segment( upl_t upl) { struct vs_map *vsmap; - struct vs_map old_vsmap; - struct vs_map new_vsmap; - int i, j, k; +// struct vs_map old_vsmap; +// struct vs_map new_vsmap; + unsigned int i, j; VS_LOCK(vs); /* block all work on this vstruct */ /* can't allow the normal multiple write */ @@ -3200,8 +3852,8 @@ ps_vstruct_transfer_from_segment( VS_UNLOCK(vs); vs_changed: if (vs->vs_indirect) { - int vsmap_size; - int clmap_off; + unsigned int vsmap_size; + int clmap_off; /* loop on indirect maps */ for (i = 0; i < INDIRECT_CLMAP_ENTRIES(vs->vs_size); i++) { vsmap = vs->vs_imap[i]; @@ -3236,6 +3888,14 @@ vs_changed: vs->vs_xfer_pending = FALSE; VS_UNLOCK(vs); vs_finish_write(vs); + + if (backing_store_abort_compaction || backing_store_stop_compaction) { + backing_store_abort_compaction = FALSE; + dprintf(("ps_vstruct_transfer_from_segment - ABORTED\n")); + return KERN_FAILURE; + } + vnode_pager_throttle(); + VS_LOCK(vs); vs->vs_xfer_pending = TRUE; vs_wait_for_sync_writers(vs); @@ -3279,10 +3939,10 @@ vs_changed: vs_finish_write(vs); VS_LOCK(vs); vs->vs_xfer_pending = TRUE; - VS_UNLOCK(vs); vs_wait_for_sync_writers(vs); vs_start_write(vs); vs_wait_for_readers(vs); + VS_UNLOCK(vs); if (vs->vs_indirect) { goto vs_changed; } @@ -3301,12 +3961,12 @@ vs_changed: vs_map_t vs_get_map_entry( vstruct_t vs, - vm_offset_t offset) + dp_offset_t offset) { struct vs_map *vsmap; - vm_offset_t cluster; + dp_offset_t cluster; - cluster = atop(offset) >> vs->vs_clshift; + cluster = atop_32(offset) >> vs->vs_clshift; if (vs->vs_indirect) { long ind_block = cluster/CLMAP_ENTRIES; @@ -3323,25 +3983,26 @@ vs_get_map_entry( kern_return_t vs_cluster_transfer( vstruct_t vs, - vm_offset_t offset, - vm_size_t cnt, + dp_offset_t offset, + dp_size_t cnt, upl_t upl) { - vm_offset_t actual_offset; + dp_offset_t actual_offset; paging_segment_t ps; struct clmap clmap; kern_return_t error = KERN_SUCCESS; - int size, size_wanted, i; - unsigned int residual; - int unavail_size; - default_pager_thread_t *dpt; - boolean_t dealloc; - struct vs_map *vsmap_ptr; + unsigned int size, size_wanted; + int i; + unsigned int residual = 0; + unsigned int unavail_size; +// default_pager_thread_t *dpt; +// boolean_t dealloc; + struct vs_map *vsmap_ptr = NULL; struct vs_map read_vsmap; struct vs_map original_read_vsmap; struct vs_map write_vsmap; - upl_t 
sync_upl; - vm_offset_t ioaddr; +// upl_t sync_upl; +// vm_offset_t ioaddr; /* vs_cluster_transfer reads in the pages of a cluster and * then writes these pages back to new backing store. The @@ -3372,7 +4033,7 @@ vs_cluster_transfer( vsmap_ptr = vs_get_map_entry(vs, offset); actual_offset = ps_clmap(vs, offset, &clmap, CL_FIND, 0, 0); - if (actual_offset == (vm_offset_t) -1) { + if (actual_offset == (dp_offset_t) -1) { /* * Nothing left to write in this cluster at least @@ -3425,6 +4086,7 @@ vs_cluster_transfer( if (size == 0) { ASSERT(unavail_size); + ps_clunmap(vs, offset, unavail_size); cnt -= unavail_size; offset += unavail_size; if((offset & ((vm_page_size << vs->vs_clshift) - 1)) @@ -3443,6 +4105,10 @@ vs_cluster_transfer( original_read_vsmap = *vsmap_ptr; if(ps->ps_segtype == PS_PARTITION) { + panic("swap partition not supported\n"); + /*NOTREACHED*/ + error = KERN_FAILURE; + residual = size; /* NEED TO ISSUE WITH SYNC & NO COMMIT error = ps_read_device(ps, actual_offset, &buffer, @@ -3450,9 +4116,9 @@ vs_cluster_transfer( */ } else { /* NEED TO ISSUE WITH SYNC & NO COMMIT */ - error = ps_read_file(ps, upl, (vm_offset_t) 0, actual_offset, + error = ps_read_file(ps, upl, (upl_offset_t) 0, actual_offset, size, &residual, - (UPL_IOSYNC | UPL_NOCOMMIT)); + (UPL_IOSYNC | UPL_NOCOMMIT | (dp_encryption ? UPL_PAGING_ENCRYPTED : 0))); } read_vsmap = *vsmap_ptr; @@ -3466,7 +4132,6 @@ vs_cluster_transfer( * */ if ((error == KERN_SUCCESS) && (residual == 0)) { - int page_list_count = 0; /* * Got everything we asked for, supply the data to @@ -3500,6 +4165,7 @@ vs_cluster_transfer( */ write_vsmap = *vsmap_ptr; *vsmap_ptr = read_vsmap; + ps_clunmap(vs, offset, size); } else { /* discard the old backing object */ write_vsmap = *vsmap_ptr; @@ -3561,14 +4227,16 @@ vs_cluster_transfer( } kern_return_t -default_pager_add_file(MACH_PORT_FACE backing_store, - int *vp, +default_pager_add_file( + MACH_PORT_FACE backing_store, + vnode_ptr_t vp, int record_size, - long size) + vm_size_t size) { backing_store_t bs; paging_segment_t ps; int i; + unsigned int j; int error; if ((bs = backing_store_lookup(backing_store)) @@ -3607,31 +4275,44 @@ default_pager_add_file(MACH_PORT_FACE backing_store, ps->ps_vnode = (struct vnode *)vp; ps->ps_offset = 0; ps->ps_record_shift = local_log2(vm_page_size / record_size); - ps->ps_recnum = size; - ps->ps_pgnum = size >> ps->ps_record_shift; + assert((dp_size_t) size == size); + ps->ps_recnum = (dp_size_t) size; + ps->ps_pgnum = ((dp_size_t) size) >> ps->ps_record_shift; ps->ps_pgcount = ps->ps_pgnum; ps->ps_clshift = local_log2(bs->bs_clsize); ps->ps_clcount = ps->ps_ncls = ps->ps_pgcount >> ps->ps_clshift; + ps->ps_special_clusters = 0; ps->ps_hint = 0; PS_LOCK_INIT(ps); ps->ps_bmap = (unsigned char *) kalloc(RMAPSIZE(ps->ps_ncls)); if (!ps->ps_bmap) { - kfree((vm_offset_t)ps, sizeof *ps); + PS_LOCK_DESTROY(ps); + kfree(ps, sizeof *ps); BS_UNLOCK(bs); return KERN_RESOURCE_SHORTAGE; } - for (i = 0; i < ps->ps_ncls; i++) { - clrbit(ps->ps_bmap, i); + for (j = 0; j < ps->ps_ncls; j++) { + clrbit(ps->ps_bmap, j); } - ps->ps_going_away = FALSE; + if(paging_segment_count == 0) { + ps->ps_state = PS_EMERGENCY_SEGMENT; + if(use_emergency_swap_file_first) { + ps->ps_state |= PS_CAN_USE; + } + emergency_segment_backing_store = backing_store; + } else { + ps->ps_state = PS_CAN_USE; + } + ps->ps_bs = bs; if ((error = ps_enter(ps)) != 0) { - kfree((vm_offset_t)ps->ps_bmap, RMAPSIZE(ps->ps_ncls)); - kfree((vm_offset_t)ps, sizeof *ps); + kfree(ps->ps_bmap, RMAPSIZE(ps->ps_ncls)); + 
PS_LOCK_DESTROY(ps); + kfree(ps, sizeof *ps); BS_UNLOCK(bs); return KERN_RESOURCE_SHORTAGE; } @@ -3639,17 +4320,42 @@ default_pager_add_file(MACH_PORT_FACE backing_store, bs->bs_pages_free += ps->ps_clcount << ps->ps_clshift; bs->bs_pages_total += ps->ps_clcount << ps->ps_clshift; PSL_LOCK(); - dp_pages_free += ps->ps_pgcount; + if(IS_PS_OK_TO_USE(ps)) { + dp_pages_free += ps->ps_pgcount; + } else { + dp_pages_reserve += ps->ps_pgcount; + } PSL_UNLOCK(); BS_UNLOCK(bs); bs_more_space(ps->ps_clcount); - DEBUG(DEBUG_BS_INTERNAL, - ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n", - device, offset, size, record_size, - ps->ps_record_shift, ps->ps_pgnum)); + /* + * If the paging segment being activated is not the emergency + * segment and we notice that the emergency segment is being + * used then we help recover it. If all goes well, the + * emergency segment will be back to its original state of + * online but not activated (till it's needed the next time). + */ +#if CONFIG_FREEZE + if (!memorystatus_freeze_enabled) +#endif + { + ps = paging_segments[EMERGENCY_PSEG_INDEX]; + if(IS_PS_EMERGENCY_SEGMENT(ps) && IS_PS_OK_TO_USE(ps)) { + if(default_pager_backing_store_delete(emergency_segment_backing_store)) { + dprintf(("Failed to recover emergency paging segment\n")); + } else { + dprintf(("Recovered emergency paging segment\n")); + } + } + } + + DP_DEBUG(DEBUG_BS_INTERNAL, + ("device=0x%x,offset=0x%x,count=0x%x,record_size=0x%x,shift=%d,total_size=0x%x\n", + device, offset, (dp_size_t) size, record_size, + ps->ps_record_shift, ps->ps_pgnum)); return KERN_SUCCESS; } @@ -3660,9 +4366,9 @@ kern_return_t ps_read_file( paging_segment_t ps, upl_t upl, - vm_offset_t upl_offset, - vm_offset_t offset, - unsigned int size, + upl_offset_t upl_offset, + dp_offset_t offset, + upl_size_t size, unsigned int *residualp, int flags) { @@ -3670,14 +4376,17 @@ ps_read_file( int error = 0; int result; + assert(dp_encryption_inited); - clustered_reads[atop(size)]++; + clustered_reads[atop_32(size)]++; f_offset = (vm_object_offset_t)(ps->ps_offset + offset); - /* for transfer case we need to pass uploffset and flags */ - error = vnode_pagein(ps->ps_vnode, - upl, upl_offset, f_offset, (vm_size_t)size, flags | UPL_NORDAHEAD, NULL); + /* + * for transfer case we need to pass uploffset and flags + */ + assert((upl_size_t) size == size); + error = vnode_pagein(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL); /* The vnode_pagein semantic is somewhat at odds with the existing */ /* device_read semantic. Partial reads are not experienced at this */ @@ -3699,21 +4408,29 @@ kern_return_t ps_write_file( paging_segment_t ps, upl_t upl, - vm_offset_t upl_offset, - vm_offset_t offset, + upl_offset_t upl_offset, + dp_offset_t offset, unsigned int size, int flags) { vm_object_offset_t f_offset; kern_return_t result; - int error = 0; + assert(dp_encryption_inited); - clustered_writes[atop(size)]++; + clustered_writes[atop_32(size)]++; f_offset = (vm_object_offset_t)(ps->ps_offset + offset); - if (vnode_pageout(ps->ps_vnode, - upl, upl_offset, f_offset, (vm_size_t)size, flags, NULL)) + if (flags & UPL_PAGING_ENCRYPTED) { + /* + * ENCRYPTED SWAP: + * encrypt all the pages that we're going + * to pageout. 
+ */ + upl_encrypt(upl, upl_offset, size); + } + assert((upl_size_t) size == size); + if (vnode_pageout(ps->ps_vnode, upl, upl_offset, f_offset, (upl_size_t)size, flags, NULL)) result = KERN_FAILURE; else result = KERN_SUCCESS; @@ -3721,28 +4438,109 @@ ps_write_file( return result; } +static inline void ps_vnode_trim_init(struct ps_vnode_trim_data *data) +{ +#pragma unused(data) +} + +static inline void ps_vnode_trim_now(struct ps_vnode_trim_data *data) +{ +#pragma unused(data) +} + +static inline void ps_vnode_trim_more(struct ps_vnode_trim_data *data, struct vs_map *map, unsigned int shift, dp_size_t length) +{ +#pragma unused(data, map, shift, length) +} + kern_return_t -default_pager_triggers(MACH_PORT_FACE default_pager, +default_pager_triggers( __unused MACH_PORT_FACE default_pager, int hi_wat, int lo_wat, int flags, MACH_PORT_FACE trigger_port) { - MACH_PORT_FACE release; + MACH_PORT_FACE release = IPC_PORT_NULL; kern_return_t kr; + clock_sec_t now; + clock_nsec_t nanoseconds_dummy; + static clock_sec_t error_notify = 0; PSL_LOCK(); - if (flags == HI_WAT_ALERT) { + if (flags == SWAP_ENCRYPT_ON) { + /* ENCRYPTED SWAP: turn encryption on */ + release = trigger_port; + if (!dp_encryption_inited) { + dp_encryption_inited = TRUE; + dp_encryption = TRUE; + kr = KERN_SUCCESS; + } else { + kr = KERN_FAILURE; + } + } else if (flags == SWAP_ENCRYPT_OFF) { + /* ENCRYPTED SWAP: turn encryption off */ + release = trigger_port; + if (!dp_encryption_inited) { + dp_encryption_inited = TRUE; + dp_encryption = FALSE; + kr = KERN_SUCCESS; + } else { + kr = KERN_FAILURE; + } + } else if (flags == HI_WAT_ALERT) { release = min_pages_trigger_port; - min_pages_trigger_port = trigger_port; - minimum_pages_remaining = hi_wat/vm_page_size; - bs_low = FALSE; - kr = KERN_SUCCESS; +#if CONFIG_FREEZE + /* High and low water signals aren't applicable when freeze is */ + /* enabled, so release the trigger ports here and return */ + /* KERN_FAILURE. */ + if (memorystatus_freeze_enabled) { + if (IP_VALID( trigger_port )){ + ipc_port_release_send( trigger_port ); + } + min_pages_trigger_port = IPC_PORT_NULL; + kr = KERN_FAILURE; + } + else +#endif + { + min_pages_trigger_port = trigger_port; + minimum_pages_remaining = hi_wat/vm_page_size; + bs_low = FALSE; + kr = KERN_SUCCESS; + } } else if (flags == LO_WAT_ALERT) { release = max_pages_trigger_port; - max_pages_trigger_port = trigger_port; - maximum_pages_free = lo_wat/vm_page_size; +#if CONFIG_FREEZE + if (memorystatus_freeze_enabled) { + if (IP_VALID( trigger_port )){ + ipc_port_release_send( trigger_port ); + } + max_pages_trigger_port = IPC_PORT_NULL; + kr = KERN_FAILURE; + } + else +#endif + { + max_pages_trigger_port = trigger_port; + maximum_pages_free = lo_wat/vm_page_size; + kr = KERN_SUCCESS; + } + } else if (flags == USE_EMERGENCY_SWAP_FILE_FIRST) { + use_emergency_swap_file_first = TRUE; + release = trigger_port; kr = KERN_SUCCESS; + } else if (flags == SWAP_FILE_CREATION_ERROR) { + release = trigger_port; + kr = KERN_SUCCESS; + if( paging_segment_count == 1) { + use_emergency_swap_file_first = TRUE; + } + no_paging_space_action(); + clock_get_system_nanotime(&now, &nanoseconds_dummy); + if (now > error_notify + 5) { + dprintf(("Swap File Error.\n")); + error_notify = now; + } } else { release = trigger_port; kr = KERN_INVALID_ARGUMENT; @@ -3754,3 +4552,83 @@ default_pager_triggers(MACH_PORT_FACE default_pager, return kr; } + +/* + * Monitor the amount of available backing store vs. 
the amount of + * required backing store, notify a listener (if present) when + * backing store may safely be removed. + * + * We attempt to avoid the situation where backing store is + * discarded en masse, as this can lead to thrashing as the + * backing store is compacted. + */ + +#define PF_INTERVAL 3 /* time between free level checks */ +#define PF_LATENCY 10 /* number of intervals before release */ + +static int dp_pages_free_low_count = 0; +thread_call_t default_pager_backing_store_monitor_callout; + +void +default_pager_backing_store_monitor(__unused thread_call_param_t p1, + __unused thread_call_param_t p2) +{ +// unsigned long long average; + ipc_port_t trigger; + uint64_t deadline; + + /* + * We determine whether it will be safe to release some + * backing store by watching the free page level. If + * it remains below the maximum_pages_free threshold for + * at least PF_LATENCY checks (taken at PF_INTERVAL seconds) + * then we deem it safe. + * + * Note that this establishes a maximum rate at which backing + * store will be released, as each notification (currently) + * only results in a single backing store object being + * released. + */ + if (dp_pages_free > maximum_pages_free) { + dp_pages_free_low_count++; + } else { + dp_pages_free_low_count = 0; + } + + /* decide whether to send notification */ + trigger = IP_NULL; + if (max_pages_trigger_port && + (backing_store_release_trigger_disable == 0) && + (dp_pages_free_low_count > PF_LATENCY)) { + trigger = max_pages_trigger_port; + max_pages_trigger_port = NULL; + } + + /* send notification */ + if (trigger != IP_NULL) { + VSL_LOCK(); + if(backing_store_release_trigger_disable != 0) { + assert_wait((event_t) + &backing_store_release_trigger_disable, + THREAD_UNINT); + VSL_UNLOCK(); + thread_block(THREAD_CONTINUE_NULL); + } else { + VSL_UNLOCK(); + } + dprintf(("default_pager_backing_store_monitor - send LO_WAT_ALERT\n")); + + default_pager_space_alert(trigger, LO_WAT_ALERT); + ipc_port_release_send(trigger); + dp_pages_free_low_count = 0; + } + + clock_interval_to_deadline(PF_INTERVAL, NSEC_PER_SEC, &deadline); + thread_call_enter_delayed(default_pager_backing_store_monitor_callout, deadline); +} + +#if CONFIG_FREEZE +unsigned int default_pager_swap_pages_free() { + return dp_pages_free; +} +#endif
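
For readers following the cluster arithmetic used by pvs_cluster_read() and vs_cluster_write() above, here is a standalone sketch (not part of the patch) of how vs_clshift determines pages per cluster, the cluster size and mask, and the page index of an offset within its cluster. The 4096-byte page size, the sample shift value, and the function names are assumptions made for the example; the expressions themselves mirror the ones in the patched routines (1 << clshift, cl_size - 1, offset & ~cl_mask, (offset & cl_mask) / page size).

/*
 * Illustration only, not xnu code.  EXAMPLE_PAGE_SIZE and
 * cluster_geometry() are hypothetical; the arithmetic follows the
 * pages_in_cl / cl_size / cl_mask / cl_index computations above.
 */
#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 4096u                 /* assumed page size for the demo */

static void cluster_geometry(unsigned int clshift, unsigned long long offset)
{
        unsigned int       pages_in_cl = 1u << clshift;                    /* pages per cluster */
        unsigned long long cl_size = (unsigned long long)pages_in_cl * EXAMPLE_PAGE_SIZE;
        unsigned long long cl_mask = cl_size - 1;                          /* power-of-two mask */

        unsigned long long cluster_base = offset & ~cl_mask;               /* start of cluster  */
        unsigned int       cl_index = (unsigned int)((offset & cl_mask) / EXAMPLE_PAGE_SIZE);

        printf("clshift=%u  pages/cluster=%u  cl_size=%llu\n", clshift, pages_in_cl, cl_size);
        printf("offset=0x%llx -> cluster base=0x%llx, page index %u within cluster\n",
               offset, cluster_base, cl_index);
}

int main(void)
{
        cluster_geometry(2, 0x7321000ULL);      /* 4 pages/cluster, 16KB clusters (assumed) */
        return 0;
}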
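
The hysteresis described in the comment of default_pager_backing_store_monitor() reduces to a counter: the free-page level is sampled every PF_INTERVAL seconds and must stay above the release threshold for more than PF_LATENCY consecutive samples before a single low-water notification is sent. The sketch below is illustrative only; monitor_tick(), pages_free_high_count and the printf are hypothetical stand-ins for the kernel's dp_pages_free_low_count bookkeeping and the default_pager_space_alert(trigger, LO_WAT_ALERT) call.

/*
 * Illustration only, not xnu code: user-space model of the
 * PF_INTERVAL / PF_LATENCY hysteresis in the monitor above.
 */
#include <stdbool.h>
#include <stdio.h>

#define PF_INTERVAL 3   /* seconds between free-level checks (as in the patch) */
#define PF_LATENCY 10   /* consecutive checks required before release          */

static int pages_free_high_count;

/* called once per PF_INTERVAL seconds */
static bool monitor_tick(unsigned int pages_free, unsigned int maximum_pages_free)
{
        if (pages_free > maximum_pages_free)
                pages_free_high_count++;
        else
                pages_free_high_count = 0;      /* any dip resets the latency window */

        if (pages_free_high_count > PF_LATENCY) {
                pages_free_high_count = 0;
                return true;                    /* caller would send one LO_WAT_ALERT */
        }
        return false;
}

int main(void)
{
        /* 12 samples with plenty of free pages: the alert fires on the 11th check */
        for (int tick = 1; tick <= 12; tick++) {
                if (monitor_tick(5000, 1024))
                        printf("tick %d: send LO_WAT_ALERT\n", tick);
        }
        return 0;
}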
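
The dirty-page gathering loop in vs_cluster_write() follows a simple pattern: skip clean pages, collect the maximal run of consecutive dirty (or precious) pages, and issue one write per run. A simplified model follows; it deliberately omits the physical-segment contiguity check and all UPL commit/abort handling, and gather_dirty_runs(), print_run() and the dirty[] array are hypothetical stand-ins for the UPL_DIRTY_PAGE()/UPL_PRECIOUS_PAGE() lookups and ps_write_file().

/*
 * Illustration only, not xnu code: find maximal runs of consecutive
 * dirty pages and report each run as one would-be write.
 */
#include <stdio.h>
#include <stdbool.h>

typedef void (*run_fn)(int first_page, int page_count);

static void gather_dirty_runs(const bool *dirty, int num_of_pages, run_fn emit)
{
        int page_index = 0;

        while (page_index < num_of_pages) {
                /* skip over non-dirty pages */
                while (page_index < num_of_pages && !dirty[page_index])
                        page_index++;
                if (page_index == num_of_pages)
                        break;

                /* gather up contiguous dirty pages */
                int first_dirty = page_index;
                while (page_index < num_of_pages && dirty[page_index])
                        page_index++;

                emit(first_dirty, page_index - first_dirty);    /* one write per run */
        }
}

static void print_run(int first_page, int page_count)
{
        printf("write %d page(s) starting at page %d\n", page_count, first_page);
}

int main(void)
{
        bool dirty[] = { false, true, true, false, false, true, true, true, false };

        gather_dirty_runs(dirty, (int)(sizeof dirty / sizeof dirty[0]), print_run);
        return 0;
}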