X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/0a7de7458d150b5d4dffc935ba399be265ef0a1a..refs/heads/master:/osfmk/vm/vm_pageout.c diff --git a/osfmk/vm/vm_pageout.c b/osfmk/vm/vm_pageout.c index 6b6e3d04d..416921ce3 100644 --- a/osfmk/vm/vm_pageout.c +++ b/osfmk/vm/vm_pageout.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. All rights reserved. + * Copyright (c) 2000-2020 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -64,6 +64,7 @@ */ #include +#include #include #include @@ -81,14 +82,14 @@ #include #include -#include +#include #include #include #include #include #include -#include #include +#include #include #include @@ -137,19 +138,24 @@ extern unsigned int memorystatus_frozen_count; extern unsigned int memorystatus_suspended_count; extern vm_pressure_level_t memorystatus_vm_pressure_level; +extern lck_mtx_t memorystatus_jetsam_fg_band_lock; +extern uint32_t memorystatus_jetsam_fg_band_waiters; + void vm_pressure_response(void); extern void consider_vm_pressure_events(void); #define MEMORYSTATUS_SUSPENDED_THRESHOLD 4 #endif /* VM_PRESSURE_EVENTS */ +thread_t vm_pageout_scan_thread = THREAD_NULL; +boolean_t vps_dynamic_priority_enabled = FALSE; #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */ -#ifdef CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024 -#else +#else /* !XNU_TARGET_OS_OSX */ #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096 -#endif +#endif /* !XNU_TARGET_OS_OSX */ #endif #ifndef VM_PAGEOUT_DEADLOCK_RELIEF @@ -208,11 +214,11 @@ extern void consider_vm_pressure_events(void); */ #ifndef VM_PAGE_FREE_TARGET -#ifdef CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 100) -#else +#else /* !XNU_TARGET_OS_OSX */ #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80) -#endif +#endif /* !XNU_TARGET_OS_OSX */ #endif /* VM_PAGE_FREE_TARGET */ @@ -222,22 +228,22 @@ extern void consider_vm_pressure_events(void); */ #ifndef VM_PAGE_FREE_MIN -#ifdef CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX #define VM_PAGE_FREE_MIN(free) (10 + (free) / 200) -#else +#else /* !XNU_TARGET_OS_OSX */ #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100) -#endif +#endif /* !XNU_TARGET_OS_OSX */ #endif /* VM_PAGE_FREE_MIN */ -#ifdef CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX #define VM_PAGE_FREE_RESERVED_LIMIT 100 #define VM_PAGE_FREE_MIN_LIMIT 1500 #define VM_PAGE_FREE_TARGET_LIMIT 2000 -#else +#else /* !XNU_TARGET_OS_OSX */ #define VM_PAGE_FREE_RESERVED_LIMIT 1700 #define VM_PAGE_FREE_MIN_LIMIT 3500 #define VM_PAGE_FREE_TARGET_LIMIT 4000 -#endif +#endif /* !XNU_TARGET_OS_OSX */ /* * When vm_page_free_count falls below vm_page_free_reserved, @@ -263,11 +269,11 @@ extern void consider_vm_pressure_events(void); #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000 #ifndef VM_PAGE_REACTIVATE_LIMIT -#ifdef CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2) -#else +#else /* !XNU_TARGET_OS_OSX */ #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX)) -#endif +#endif /* !XNU_TARGET_OS_OSX */ #endif /* VM_PAGE_REACTIVATE_LIMIT */ #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 1000 @@ -304,12 +310,14 @@ static void vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *, boolean_t extern void vm_pageout_continue(void); extern void vm_pageout_scan(void); -void vm_tests(void); /* forward */ +boolean_t vm_pageout_running = FALSE; + +uint32_t 
vm_page_upl_tainted = 0; +uint32_t vm_page_iopl_tainted = 0; -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX static boolean_t vm_pageout_waiter = FALSE; -static boolean_t vm_pageout_running = FALSE; -#endif /* !CONFIG_EMBEDDED */ +#endif /* XNU_TARGET_OS_OSX */ #if DEVELOPMENT || DEBUG @@ -319,8 +327,8 @@ struct vm_pageout_vminfo vm_pageout_vminfo; struct vm_pageout_state vm_pageout_state; struct vm_config vm_config; -struct vm_pageout_queue vm_pageout_queue_internal __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); -struct vm_pageout_queue vm_pageout_queue_external __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); +struct vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED; +struct vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED; int vm_upl_wait_for_pages = 0; vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL; @@ -329,7 +337,7 @@ boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL; int vm_debug_events = 0; -lck_grp_t vm_pageout_lck_grp; +LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout"); #if CONFIG_MEMORYSTATUS extern boolean_t memorystatus_kill_on_VM_page_shortage(boolean_t async); @@ -339,6 +347,11 @@ uint32_t vm_pageout_memorystatus_fb_factor_dr = 2; #endif +#if __AMP__ +int vm_compressor_ebound = 1; +int vm_pgo_pbound = 0; +extern void thread_bind_cluster_type(thread_t, char, bool); +#endif /* __AMP__ */ /* @@ -433,7 +446,7 @@ vm_pageout_object_terminate( if (m->vmp_dirty) { vm_page_unwire(m, TRUE); /* reactivates */ - VM_STAT_INCR(reactivations); + counter_inc(&vm_statistics_reactivations); PAGE_WAKEUP_DONE(m); } else { vm_page_free(m); /* clears busy, etc. */ @@ -529,11 +542,6 @@ vm_pageclean_setup( assert(!m->vmp_cleaning); #endif - XPR(XPR_VM_PAGEOUT, - "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n", - VM_PAGE_OBJECT(m), m->vmp_offset, m, - new_m, new_offset); - pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m)); /* @@ -589,10 +597,6 @@ vm_pageout_initialize_page( vm_object_offset_t paging_offset; memory_object_t pager; - XPR(XPR_VM_PAGEOUT, - "vm_pageout_initialize_page, page 0x%X\n", - m, 0, 0, 0, 0); - assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); object = VM_PAGE_OBJECT(m); @@ -699,11 +703,6 @@ vm_pageout_cluster(vm_page_t m) vm_object_t object = VM_PAGE_OBJECT(m); struct vm_pageout_queue *q; - - XPR(XPR_VM_PAGEOUT, - "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n", - object, m->vmp_offset, m, 0, 0); - VM_PAGE_CHECK(m); LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); vm_object_lock_assert_exclusive(object); @@ -1546,7 +1545,8 @@ extern struct memory_object_pager_ops shared_region_pager_ops; void update_vm_info(void) { - uint64_t tmp; + unsigned long tmp; + uint64_t tmp64; vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count; vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count; @@ -1567,17 +1567,17 @@ update_vm_info(void) vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page); last.vm_pageout_considered_page = tmp; - tmp = vm_pageout_vminfo.vm_pageout_compressions; - vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp - last.vm_pageout_compressions); - last.vm_pageout_compressions = tmp; + tmp64 = vm_pageout_vminfo.vm_pageout_compressions; + vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions); + last.vm_pageout_compressions = tmp64; tmp = vm_pageout_vminfo.vm_compressor_failed; 
vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed); last.vm_compressor_failed = tmp; - tmp = vm_pageout_vminfo.vm_compressor_pages_grabbed; - vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp - last.vm_compressor_pages_grabbed); - last.vm_compressor_pages_grabbed = tmp; + tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed; + vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed); + last.vm_compressor_pages_grabbed = tmp64; tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost; vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost); @@ -1587,9 +1587,9 @@ update_vm_info(void) vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost); last.vm_phantom_cache_added_ghost = tmp; - tmp = get_pages_grabbed_count(); - vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp - last_vm_page_pages_grabbed); - last_vm_page_pages_grabbed = tmp; + tmp64 = counter_load(&vm_page_grab_count); + vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed); + last_vm_page_pages_grabbed = tmp64; tmp = vm_pageout_vminfo.vm_page_pages_freed; vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed); @@ -1741,694 +1741,1353 @@ update_vm_info(void) extern boolean_t hibernation_vmqueues_inspection; -void -vm_page_balance_inactive(int max_to_move) -{ - vm_page_t m; +/* + * Return values for functions called by vm_pageout_scan + * that control its flow. + * + * PROCEED -- vm_pageout_scan will keep making forward progress. + * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns. + * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue. + */ - LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); +#define VM_PAGEOUT_SCAN_PROCEED (0) +#define VM_PAGEOUT_SCAN_DONE_RETURN (1) +#define VM_PAGEOUT_SCAN_NEXT_ITERATION (2) + +/* + * This function is called only from vm_pageout_scan and + * it moves overflow secluded pages (one-at-a-time) to the + * batched 'local' free Q or active Q. + */ +static void +vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed) +{ +#if CONFIG_SECLUDED_MEMORY + /* + * Deal with secluded_q overflow. + */ + if (vm_page_secluded_count > vm_page_secluded_target) { + vm_page_t secluded_page; - if (hibernation_vmqueues_inspection == TRUE) { /* - * It is likely that the hibernation code path is - * dealing with these very queues as we are about - * to move pages around in/from them and completely - * change the linkage of the pages. - * - * And so we skip the rebalancing of these queues. + * SECLUDED_AGING_BEFORE_ACTIVE: + * Excess secluded pages go to the active queue and + * will later go to the inactive queue. 
*/ - return; + assert((vm_page_secluded_count_free + + vm_page_secluded_count_inuse) == + vm_page_secluded_count); + secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded); + assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q); + + vm_page_queues_remove(secluded_page, FALSE); + assert(!secluded_page->vmp_fictitious); + assert(!VM_PAGE_WIRED(secluded_page)); + + if (secluded_page->vmp_object == 0) { + /* transfer to free queue */ + assert(secluded_page->vmp_busy); + secluded_page->vmp_snext = *local_freeq; + *local_freeq = secluded_page; + *local_freed += 1; + } else { + /* transfer to head of active queue */ + vm_page_enqueue_active(secluded_page, FALSE); + secluded_page = VM_PAGE_NULL; + } } - vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count + - vm_page_inactive_count + - vm_page_speculative_count); +#else /* CONFIG_SECLUDED_MEMORY */ - while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) { - VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1); - - m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active); +#pragma unused(local_freeq) +#pragma unused(local_freed) - assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q); - assert(!m->vmp_laundry); - assert(VM_PAGE_OBJECT(m) != kernel_object); - assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); + return; - DTRACE_VM2(scan, int, 1, (uint64_t *), NULL); +#endif /* CONFIG_SECLUDED_MEMORY */ +} - /* - * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise... - * - * a TLB flush isn't really needed here since at worst we'll miss the reference bit being - * updated in the PTE if a remote processor still has this mapping cached in its TLB when the - * new reference happens. If no futher references happen on the page after that remote TLB flushes - * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue - * by pageout_scan, which is just fine since the last reference would have happened quite far - * in the past (TLB caches don't hang around for very long), and of course could just as easily - * have happened before we moved the page - */ - if (m->vmp_pmapped == TRUE) { - pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL); - } +/* + * This function is called only from vm_pageout_scan and + * it initializes the loop targets for vm_pageout_scan(). + */ +static void +vps_init_page_targets(void) +{ + /* + * LD TODO: Other page targets should be calculated here too. + */ + vm_page_anonymous_min = vm_page_inactive_target / 20; - /* - * The page might be absent or busy, - * but vm_page_deactivate can handle that. - * FALSE indicates that we don't want a H/W clear reference - */ - vm_page_deactivate_internal(m, FALSE); + if (vm_pageout_state.vm_page_speculative_percentage > 50) { + vm_pageout_state.vm_page_speculative_percentage = 50; + } else if (vm_pageout_state.vm_page_speculative_percentage <= 0) { + vm_pageout_state.vm_page_speculative_percentage = 1; } -} + vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count + + vm_page_inactive_count); +} /* - * vm_pageout_scan does the dirty work for the pageout daemon. - * It returns with both vm_page_queue_free_lock and vm_page_queue_lock - * held and vm_page_free_wanted == 0. 
+ * This function is called only from vm_pageout_scan and + * it purges a single VM object at-a-time and will either + * make vm_pageout_scan() restart the loop or keeping moving forward. */ -void -vm_pageout_scan(void) +static int +vps_purge_object() { - unsigned int loop_count = 0; - unsigned int inactive_burst_count = 0; - unsigned int reactivated_this_call; - unsigned int reactivate_limit; - vm_page_t local_freeq = NULL; - int local_freed = 0; - int delayed_unlock; - int delayed_unlock_limit = 0; - int refmod_state = 0; - int vm_pageout_deadlock_target = 0; - struct vm_pageout_queue *iq; - struct vm_pageout_queue *eq; - struct vm_speculative_age_q *sq; - struct flow_control flow_control = { 0, { 0, 0 } }; - boolean_t inactive_throttled = FALSE; - mach_timespec_t ts; - unsigned int msecs = 0; - vm_object_t object = NULL; - uint32_t inactive_reclaim_run; - boolean_t exceeded_burst_throttle; - boolean_t grab_anonymous = FALSE; - boolean_t force_anonymous = FALSE; - boolean_t force_speculative_aging = FALSE; - int anons_grabbed = 0; - int page_prev_q_state = 0; -#if CONFIG_BACKGROUND_QUEUE - boolean_t page_from_bg_q = FALSE; -#endif - int cache_evict_throttle = 0; - uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0; - uint32_t inactive_external_count; - int force_purge = 0; - int divisor; -#define DELAY_SPECULATIVE_AGE 1000 - int delay_speculative_age = 0; - vm_object_t m_object = VM_OBJECT_NULL; + int force_purge; + + assert(available_for_purge >= 0); + force_purge = 0; /* no force-purging */ #if VM_PRESSURE_EVENTS vm_pressure_level_t pressure_level; -#endif /* VM_PRESSURE_EVENTS */ - - VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START, - vm_pageout_vminfo.vm_pageout_freed_speculative, - vm_pageout_state.vm_pageout_inactive_clean, - vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, - vm_pageout_vminfo.vm_pageout_inactive_dirty_external); - flow_control.state = FCS_IDLE; - iq = &vm_pageout_queue_internal; - eq = &vm_pageout_queue_external; - sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; + pressure_level = memorystatus_vm_pressure_level; + if (pressure_level > kVMPressureNormal) { + if (pressure_level >= kVMPressureCritical) { + force_purge = vm_pageout_state.memorystatus_purge_on_critical; + } else if (pressure_level >= kVMPressureUrgent) { + force_purge = vm_pageout_state.memorystatus_purge_on_urgent; + } else if (pressure_level >= kVMPressureWarning) { + force_purge = vm_pageout_state.memorystatus_purge_on_warning; + } + } +#endif /* VM_PRESSURE_EVENTS */ - XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0); + if (available_for_purge || force_purge) { + memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START); - /* Ask the pmap layer to return any pages it no longer needs. 
*/ - uint64_t pmap_wired_pages_freed = pmap_release_pages_fast(); + VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0); + if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) { + VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1); + VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0); + memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END); - vm_page_lock_queues(); + return VM_PAGEOUT_SCAN_NEXT_ITERATION; + } + VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1); + memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END); + } - vm_page_wire_count -= pmap_wired_pages_freed; + return VM_PAGEOUT_SCAN_PROCEED; +} - delayed_unlock = 1; +/* + * This function is called only from vm_pageout_scan and + * it will try to age the next speculative Q if the oldest + * one is empty. + */ +static int +vps_age_speculative_queue(boolean_t force_speculative_aging) +{ +#define DELAY_SPECULATIVE_AGE 1000 /* - * Calculate the max number of referenced pages on the inactive - * queue that we will reactivate. + * try to pull pages from the aging bins... + * see vm_page.h for an explanation of how + * this mechanism works */ - reactivated_this_call = 0; - reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count + - vm_page_inactive_count); - inactive_reclaim_run = 0; + boolean_t can_steal = FALSE; + int num_scanned_queues; + static int delay_speculative_age = 0; /* depends the # of times we go through the main pageout_scan loop.*/ + mach_timespec_t ts; + struct vm_speculative_age_q *aq; + struct vm_speculative_age_q *sq; - vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count; + sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; - /* - * We must limit the rate at which we send pages to the pagers - * so that we don't tie up too many pages in the I/O queues. - * We implement a throttling mechanism using the laundry count - * to limit the number of pages outstanding to the default - * and external pagers. We can bypass the throttles and look - * for clean pages if the pageout queues don't drain in a timely - * fashion since this may indicate that the pageout paths are - * stalled waiting for memory, which only we can provide. - */ + aq = &vm_page_queue_speculative[speculative_steal_index]; -Restart: + num_scanned_queues = 0; + while (vm_page_queue_empty(&aq->age_q) && + num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) { + speculative_steal_index++; - assert(object == NULL); - assert(delayed_unlock != 0); + if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) { + speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q; + } - vm_page_anonymous_min = vm_page_inactive_target / 20; + aq = &vm_page_queue_speculative[speculative_steal_index]; + } - if (vm_pageout_state.vm_page_speculative_percentage > 50) { - vm_pageout_state.vm_page_speculative_percentage = 50; - } else if (vm_pageout_state.vm_page_speculative_percentage <= 0) { - vm_pageout_state.vm_page_speculative_percentage = 1; + if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) { + /* + * XXX We've scanned all the speculative + * queues but still haven't found one + * that is not empty, even though + * vm_page_speculative_count is not 0. 
+ */ + if (!vm_page_queue_empty(&sq->age_q)) { + return VM_PAGEOUT_SCAN_NEXT_ITERATION; + } +#if DEVELOPMENT || DEBUG + panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count); +#endif + /* readjust... */ + vm_page_speculative_count = 0; + /* ... and continue */ + return VM_PAGEOUT_SCAN_NEXT_ITERATION; } - vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count + - vm_page_inactive_count); + if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) { + can_steal = TRUE; + } else { + if (!delay_speculative_age) { + mach_timespec_t ts_fully_aged; - for (;;) { - vm_page_t m; + ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000; + ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000) + * 1000 * NSEC_PER_USEC; - DTRACE_VM2(rev, int, 1, (uint64_t *), NULL); + ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts); - if (vm_upl_wait_for_pages < 0) { - vm_upl_wait_for_pages = 0; + clock_sec_t sec; + clock_nsec_t nsec; + clock_get_system_nanotime(&sec, &nsec); + ts.tv_sec = (unsigned int) sec; + ts.tv_nsec = nsec; + + if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) { + can_steal = TRUE; + } else { + delay_speculative_age++; + } + } else { + delay_speculative_age++; + if (delay_speculative_age == DELAY_SPECULATIVE_AGE) { + delay_speculative_age = 0; + } } + } + if (can_steal == TRUE) { + vm_page_speculate_ageit(aq); + } - delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages; + return VM_PAGEOUT_SCAN_PROCEED; +} - if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) { - delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX; - } +/* + * This function is called only from vm_pageout_scan and + * it evicts a single VM object from the cache. + */ +static int inline +vps_object_cache_evict(vm_object_t *object_to_unlock) +{ + static int cache_evict_throttle = 0; + struct vm_speculative_age_q *sq; -#if CONFIG_SECLUDED_MEMORY - /* - * Deal with secluded_q overflow. - */ - if (vm_page_secluded_count > vm_page_secluded_target) { - vm_page_t secluded_page; + sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; - /* - * SECLUDED_AGING_BEFORE_ACTIVE: - * Excess secluded pages go to the active queue and - * will later go to the inactive queue. 
- */ - assert((vm_page_secluded_count_free + - vm_page_secluded_count_inuse) == - vm_page_secluded_count); - secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded); - assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q); - - vm_page_queues_remove(secluded_page, FALSE); - assert(!secluded_page->vmp_fictitious); - assert(!VM_PAGE_WIRED(secluded_page)); - - if (secluded_page->vmp_object == 0) { - /* transfer to free queue */ - assert(secluded_page->vmp_busy); - secluded_page->vmp_snext = local_freeq; - local_freeq = secluded_page; - local_freed++; - } else { - /* transfer to head of active queue */ - vm_page_enqueue_active(secluded_page, FALSE); - secluded_page = VM_PAGE_NULL; - } - } -#endif /* CONFIG_SECLUDED_MEMORY */ + if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) { + int pages_evicted; - assert(delayed_unlock); + if (*object_to_unlock != NULL) { + vm_object_unlock(*object_to_unlock); + *object_to_unlock = NULL; + } + KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0); - /* - * maintain our balance - */ - vm_page_balance_inactive(1); + pages_evicted = vm_object_cache_evict(100, 10); + KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0); - /********************************************************************** - * above this point we're playing with the active and secluded queues - * below this point we're playing with the throttling mechanisms - * and the inactive queue - **********************************************************************/ + if (pages_evicted) { + vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted; - if (vm_page_free_count + local_freed >= vm_page_free_target) { - vm_pageout_scan_wants_object = VM_OBJECT_NULL; + VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE, + vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0); + memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE); - vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed, - VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER); /* - * make sure the pageout I/O threads are running - * throttled in case there are still requests - * in the laundry... since we have met our targets - * we don't need the laundry to be cleaned in a timely - * fashion... so let's avoid interfering with foreground - * activity + * we just freed up to 100 pages, + * so go back to the top of the main loop + * and re-evaulate the memory situation */ - vm_pageout_adjust_eq_iothrottle(eq, TRUE); + return VM_PAGEOUT_SCAN_NEXT_ITERATION; + } else { + cache_evict_throttle = 1000; + } + } + if (cache_evict_throttle) { + cache_evict_throttle--; + } - lck_mtx_lock(&vm_page_queue_free_lock); + return VM_PAGEOUT_SCAN_PROCEED; +} - if ((vm_page_free_count >= vm_page_free_target) && - (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) { - /* - * done - we have met our target *and* - * there is no one waiting for a page. 
- */ -return_from_scan: - assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL); - - VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE, - vm_pageout_state.vm_pageout_inactive, - vm_pageout_state.vm_pageout_inactive_used, 0, 0); - VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END, - vm_pageout_vminfo.vm_pageout_freed_speculative, - vm_pageout_state.vm_pageout_inactive_clean, - vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, - vm_pageout_vminfo.vm_pageout_inactive_dirty_external); - return; - } - lck_mtx_unlock(&vm_page_queue_free_lock); - } +/* + * This function is called only from vm_pageout_scan and + * it calculates the filecache min. that needs to be maintained + * as we start to steal pages. + */ +static void +vps_calculate_filecache_min(void) +{ + int divisor = vm_pageout_state.vm_page_filecache_min_divisor; +#if CONFIG_JETSAM + /* + * don't let the filecache_min fall below 15% of available memory + * on systems with an active compressor that isn't nearing its + * limits w/r to accepting new data + * + * on systems w/o the compressor/swapper, the filecache is always + * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY + * since most (if not all) of the anonymous pages are in the + * throttled queue (which isn't counted as available) which + * effectively disables this filter + */ + if (vm_compressor_low_on_space() || divisor == 0) { + vm_pageout_state.vm_page_filecache_min = 0; + } else { + vm_pageout_state.vm_page_filecache_min = + ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor; + } +#else + if (vm_compressor_out_of_space() || divisor == 0) { + vm_pageout_state.vm_page_filecache_min = 0; + } else { /* - * Before anything, we check if we have any ripe volatile - * objects around. If so, try to purge the first object. - * If the purge fails, fall through to reclaim a page instead. - * If the purge succeeds, go back to the top and reevalute - * the new memory situation. 
+ * don't let the filecache_min fall below the specified critical level */ + vm_pageout_state.vm_page_filecache_min = + ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor; + } +#endif + if (vm_page_free_count < (vm_page_free_reserved / 4)) { + vm_pageout_state.vm_page_filecache_min = 0; + } +} - assert(available_for_purge >= 0); - force_purge = 0; /* no force-purging */ - -#if VM_PRESSURE_EVENTS - pressure_level = memorystatus_vm_pressure_level; - - if (pressure_level > kVMPressureNormal) { - if (pressure_level >= kVMPressureCritical) { - force_purge = vm_pageout_state.memorystatus_purge_on_critical; - } else if (pressure_level >= kVMPressureUrgent) { - force_purge = vm_pageout_state.memorystatus_purge_on_urgent; - } else if (pressure_level >= kVMPressureWarning) { - force_purge = vm_pageout_state.memorystatus_purge_on_warning; - } - } -#endif /* VM_PRESSURE_EVENTS */ - - if (available_for_purge || force_purge) { - if (object != NULL) { - vm_object_unlock(object); - object = NULL; - } - - memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START); - - VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0); - if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) { - VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1); - VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0); - memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END); - continue; - } - VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1); - memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END); - } +/* + * This function is called only from vm_pageout_scan and + * it updates the flow control time to detect if VM pageoutscan + * isn't making progress. + */ +static void +vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control) +{ + mach_timespec_t ts; + clock_sec_t sec; + clock_nsec_t nsec; - if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) { - /* - * try to pull pages from the aging bins... - * see vm_page.h for an explanation of how - * this mechanism works - */ - struct vm_speculative_age_q *aq; - boolean_t can_steal = FALSE; - int num_scanned_queues; + ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000; + ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC; + clock_get_system_nanotime(&sec, &nsec); + flow_control->ts.tv_sec = (unsigned int) sec; + flow_control->ts.tv_nsec = nsec; + ADD_MACH_TIMESPEC(&flow_control->ts, &ts); - aq = &vm_page_queue_speculative[speculative_steal_index]; + flow_control->state = FCS_DELAYED; - num_scanned_queues = 0; - while (vm_page_queue_empty(&aq->age_q) && - num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) { - speculative_steal_index++; + vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++; +} - if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) { - speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q; - } +/* + * This function is called only from vm_pageout_scan and + * it is the flow control logic of VM pageout scan which + * controls if it should block and for how long. + * Any blocking of vm_pageout_scan happens ONLY in this function. 
+ */ +static int +vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock, + vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count) +{ + boolean_t exceeded_burst_throttle = FALSE; + unsigned int msecs = 0; + uint32_t inactive_external_count; + mach_timespec_t ts; + struct vm_pageout_queue *iq; + struct vm_pageout_queue *eq; + struct vm_speculative_age_q *sq; - aq = &vm_page_queue_speculative[speculative_steal_index]; - } + iq = &vm_pageout_queue_internal; + eq = &vm_pageout_queue_external; + sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; - if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) { + /* + * Sometimes we have to pause: + * 1) No inactive pages - nothing to do. + * 2) Loop control - no acceptable pages found on the inactive queue + * within the last vm_pageout_burst_inactive_throttle iterations + * 3) Flow control - default pageout queue is full + */ + if (vm_page_queue_empty(&vm_page_queue_inactive) && + vm_page_queue_empty(&vm_page_queue_anonymous) && + vm_page_queue_empty(&vm_page_queue_cleaned) && + vm_page_queue_empty(&sq->age_q)) { + VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1); + msecs = vm_pageout_state.vm_pageout_empty_wait; + } else if (inactive_burst_count >= + MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle, + (vm_page_inactive_count + + vm_page_speculative_count))) { + VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1); + msecs = vm_pageout_state.vm_pageout_burst_wait; + + exceeded_burst_throttle = TRUE; + } else if (VM_PAGE_Q_THROTTLED(iq) && + VM_DYNAMIC_PAGING_ENABLED()) { + clock_sec_t sec; + clock_nsec_t nsec; + + switch (flow_control->state) { + case FCS_IDLE: + if ((vm_page_free_count + *local_freed) < vm_page_free_target && + vm_pageout_state.vm_restricted_to_single_processor == FALSE) { /* - * XXX We've scanned all the speculative - * queues but still haven't found one - * that is not empty, even though - * vm_page_speculative_count is not 0. + * since the compressor is running independently of vm_pageout_scan + * let's not wait for it just yet... as long as we have a healthy supply + * of filecache pages to work with, let's keep stealing those. */ - if (!vm_page_queue_empty(&sq->age_q)) { - continue; + inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count; + + if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min && + (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) { + *anons_grabbed = ANONS_GRABBED_LIMIT; + VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1); + return VM_PAGEOUT_SCAN_PROCEED; } -#if DEVELOPMENT || DEBUG - panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count); -#endif - /* readjust... */ - vm_page_speculative_count = 0; - /* ... 
and continue */ - continue; } - if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) { - can_steal = TRUE; - } else { - if (!delay_speculative_age) { - mach_timespec_t ts_fully_aged; + vps_flow_control_reset_deadlock_timer(flow_control); + msecs = vm_pageout_state.vm_pageout_deadlock_wait; - ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000; - ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000) - * 1000 * NSEC_PER_USEC; + break; - ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts); + case FCS_DELAYED: + clock_get_system_nanotime(&sec, &nsec); + ts.tv_sec = (unsigned int) sec; + ts.tv_nsec = nsec; - clock_sec_t sec; - clock_nsec_t nsec; - clock_get_system_nanotime(&sec, &nsec); - ts.tv_sec = (unsigned int) sec; - ts.tv_nsec = nsec; + if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) { + /* + * the pageout thread for the default pager is potentially + * deadlocked since the + * default pager queue has been throttled for more than the + * allowable time... we need to move some clean pages or dirty + * pages belonging to the external pagers if they aren't throttled + * vm_page_free_wanted represents the number of threads currently + * blocked waiting for pages... we'll move one page for each of + * these plus a fixed amount to break the logjam... once we're done + * moving this number of pages, we'll re-enter the FSC_DELAYED state + * with a new timeout target since we have no way of knowing + * whether we've broken the deadlock except through observation + * of the queue associated with the default pager... we need to + * stop moving pages and allow the system to run to see what + * state it settles into. + */ - if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) { - can_steal = TRUE; - } else { - delay_speculative_age++; - } - } else { - delay_speculative_age++; - if (delay_speculative_age == DELAY_SPECULATIVE_AGE) { - delay_speculative_age = 0; - } - } + *vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief + + vm_page_free_wanted + vm_page_free_wanted_privileged; + VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1); + flow_control->state = FCS_DEADLOCK_DETECTED; + thread_wakeup((event_t) &vm_pageout_garbage_collect); + return VM_PAGEOUT_SCAN_PROCEED; } - if (can_steal == TRUE) { - vm_page_speculate_ageit(aq); - } - } - force_speculative_aging = FALSE; - - if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) { - int pages_evicted; + /* + * just resniff instead of trying + * to compute a new delay time... we're going to be + * awakened immediately upon a laundry completion, + * so we won't wait any longer than necessary + */ + msecs = vm_pageout_state.vm_pageout_idle_wait; + break; - if (object != NULL) { - vm_object_unlock(object); - object = NULL; + case FCS_DEADLOCK_DETECTED: + if (*vm_pageout_deadlock_target) { + return VM_PAGEOUT_SCAN_PROCEED; } - KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0); - pages_evicted = vm_object_cache_evict(100, 10); + vps_flow_control_reset_deadlock_timer(flow_control); + msecs = vm_pageout_state.vm_pageout_deadlock_wait; - KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0); + break; + } + } else { + /* + * No need to pause... 
+ */ + return VM_PAGEOUT_SCAN_PROCEED; + } - if (pages_evicted) { - vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted; + vm_pageout_scan_wants_object = VM_OBJECT_NULL; - VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE, - vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0); - memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE); + vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed, + VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER); - /* - * we just freed up to 100 pages, - * so go back to the top of the main loop - * and re-evaulate the memory situation - */ - continue; - } else { - cache_evict_throttle = 1000; - } - } - if (cache_evict_throttle) { - cache_evict_throttle--; - } + if (vm_page_free_count >= vm_page_free_target) { + /* + * we're here because + * 1) someone else freed up some pages while we had + * the queues unlocked above + * and we've hit one of the 3 conditions that + * cause us to pause the pageout scan thread + * + * since we already have enough free pages, + * let's avoid stalling and return normally + * + * before we return, make sure the pageout I/O threads + * are running throttled in case there are still requests + * in the laundry... since we have enough free pages + * we don't need the laundry to be cleaned in a timely + * fashion... so let's avoid interfering with foreground + * activity + * + * we don't want to hold vm_page_queue_free_lock when + * calling vm_pageout_adjust_eq_iothrottle (since it + * may cause other locks to be taken), we do the intitial + * check outside of the lock. Once we take the lock, + * we recheck the condition since it may have changed. + * if it has, no problem, we will make the threads + * non-throttled before actually blocking + */ + vm_pageout_adjust_eq_iothrottle(eq, TRUE); + } + lck_mtx_lock(&vm_page_queue_free_lock); - divisor = vm_pageout_state.vm_page_filecache_min_divisor; + if (vm_page_free_count >= vm_page_free_target && + (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) { + return VM_PAGEOUT_SCAN_DONE_RETURN; + } + lck_mtx_unlock(&vm_page_queue_free_lock); -#if CONFIG_JETSAM + if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) { /* - * don't let the filecache_min fall below 15% of available memory - * on systems with an active compressor that isn't nearing its - * limits w/r to accepting new data + * we're most likely about to block due to one of + * the 3 conditions that cause vm_pageout_scan to + * not be able to make forward progress w/r + * to providing new pages to the free queue, + * so unthrottle the I/O threads in case we + * have laundry to be cleaned... it needs + * to be completed ASAP. * - * on systems w/o the compressor/swapper, the filecache is always - * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY - * since most (if not all) of the anonymous pages are in the - * throttled queue (which isn't counted as available) which - * effectively disables this filter + * even if we don't block, we want the io threads + * running unthrottled since the sum of free + + * clean pages is still under our free target + */ + vm_pageout_adjust_eq_iothrottle(eq, FALSE); + } + if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) { + /* + * if we get here we're below our free target and + * we're stalling due to a full laundry queue or + * we don't have any inactive pages other then + * those in the clean queue... 
+ * however, we have pages on the clean queue that + * can be moved to the free queue, so let's not + * stall the pageout scan */ - if (vm_compressor_low_on_space() || divisor == 0) { - vm_pageout_state.vm_page_filecache_min = 0; + flow_control->state = FCS_IDLE; + return VM_PAGEOUT_SCAN_PROCEED; + } + if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) { + flow_control->state = FCS_IDLE; + return VM_PAGEOUT_SCAN_PROCEED; + } + + VM_CHECK_MEMORYSTATUS; + + if (flow_control->state != FCS_IDLE) { + VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1); + } + + iq->pgo_throttled = TRUE; + assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC); + + vm_page_unlock_queues(); + + assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL); + + VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START, + iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0); + memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START); + + thread_block(THREAD_CONTINUE_NULL); + + VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END, + iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0); + memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END); + + vm_page_lock_queues(); + + iq->pgo_throttled = FALSE; + + vps_init_page_targets(); + + return VM_PAGEOUT_SCAN_NEXT_ITERATION; +} + +/* + * This function is called only from vm_pageout_scan and + * it will find and return the most appropriate page to be + * reclaimed. + */ +static int +vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous, + boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call) +{ + vm_page_t m = NULL; + vm_object_t m_object = VM_OBJECT_NULL; + uint32_t inactive_external_count; + struct vm_speculative_age_q *sq; + struct vm_pageout_queue *iq; + int retval = VM_PAGEOUT_SCAN_PROCEED; + + sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; + iq = &vm_pageout_queue_internal; + + *is_page_from_bg_q = FALSE; + + m = NULL; + m_object = VM_OBJECT_NULL; + + if (VM_DYNAMIC_PAGING_ENABLED()) { + assert(vm_page_throttled_count == 0); + assert(vm_page_queue_empty(&vm_page_queue_throttled)); + } + + /* + * Try for a clean-queue inactive page. + * These are pages that vm_pageout_scan tried to steal earlier, but + * were dirty and had to be cleaned. Pick them up now that they are clean. + */ + if (!vm_page_queue_empty(&vm_page_queue_cleaned)) { + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned); + + assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q); + + goto found_page; + } + + /* + * The next most eligible pages are ones we paged in speculatively, + * but which have not yet been touched and have been aged out. 
+ */ + if (!vm_page_queue_empty(&sq->age_q)) { + m = (vm_page_t) vm_page_queue_first(&sq->age_q); + + assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q); + + if (!m->vmp_dirty || force_anonymous == FALSE) { + goto found_page; } else { - vm_pageout_state.vm_page_filecache_min = - ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor; + m = NULL; } -#else - if (vm_compressor_out_of_space() || divisor == 0) { - vm_pageout_state.vm_page_filecache_min = 0; - } else { + } + +#if CONFIG_BACKGROUND_QUEUE + if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) { + vm_object_t bg_m_object = NULL; + + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background); + + bg_m_object = VM_PAGE_OBJECT(m); + + if (!VM_PAGE_PAGEABLE(m)) { /* - * don't let the filecache_min fall below the specified critical level + * This page is on the background queue + * but not on a pageable queue. This is + * likely a transient state and whoever + * took it out of its pageable queue + * will likely put it back on a pageable + * queue soon but we can't deal with it + * at this point, so let's ignore this + * page. */ - vm_pageout_state.vm_page_filecache_min = - ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor; - } -#endif - if (vm_page_free_count < (vm_page_free_reserved / 4)) { - vm_pageout_state.vm_page_filecache_min = 0; + } else if (force_anonymous == FALSE || bg_m_object->internal) { + if (bg_m_object->internal && + (VM_PAGE_Q_THROTTLED(iq) || + vm_compressor_out_of_space() == TRUE || + vm_page_free_count < (vm_page_free_reserved / 4))) { + vm_pageout_skipped_bq_internal++; + } else { + *is_page_from_bg_q = TRUE; + + if (bg_m_object->internal) { + vm_pageout_vminfo.vm_pageout_considered_bq_internal++; + } else { + vm_pageout_vminfo.vm_pageout_considered_bq_external++; + } + goto found_page; + } } + } +#endif /* CONFIG_BACKGROUND_QUEUE */ - exceeded_burst_throttle = FALSE; - /* - * Sometimes we have to pause: - * 1) No inactive pages - nothing to do. - * 2) Loop control - no acceptable pages found on the inactive queue - * within the last vm_pageout_burst_inactive_throttle iterations - * 3) Flow control - default pageout queue is full - */ - if (vm_page_queue_empty(&vm_page_queue_inactive) && - vm_page_queue_empty(&vm_page_queue_anonymous) && - vm_page_queue_empty(&vm_page_queue_cleaned) && - vm_page_queue_empty(&sq->age_q)) { - VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1); - msecs = vm_pageout_state.vm_pageout_empty_wait; - goto vm_pageout_scan_delay; - } else if (inactive_burst_count >= - MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle, - (vm_page_inactive_count + - vm_page_speculative_count))) { - VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1); - msecs = vm_pageout_state.vm_pageout_burst_wait; - - exceeded_burst_throttle = TRUE; - goto vm_pageout_scan_delay; - } else if (VM_PAGE_Q_THROTTLED(iq) && - VM_DYNAMIC_PAGING_ENABLED()) { - clock_sec_t sec; - clock_nsec_t nsec; + inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count; - switch (flow_control.state) { - case FCS_IDLE: - if ((vm_page_free_count + local_freed) < vm_page_free_target && - vm_pageout_state.vm_restricted_to_single_processor == FALSE) { - /* - * since the compressor is running independently of vm_pageout_scan - * let's not wait for it just yet... as long as we have a healthy supply - * of filecache pages to work with, let's keep stealing those. 
- */ - inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count; + if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) || + (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) { + *grab_anonymous = TRUE; + *anons_grabbed = 0; - if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min && - (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) { - anons_grabbed = ANONS_GRABBED_LIMIT; - VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1); - goto consider_inactive; - } - } -reset_deadlock_timer: - ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000; - ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC; - clock_get_system_nanotime(&sec, &nsec); - flow_control.ts.tv_sec = (unsigned int) sec; - flow_control.ts.tv_nsec = nsec; - ADD_MACH_TIMESPEC(&flow_control.ts, &ts); - - flow_control.state = FCS_DELAYED; - msecs = vm_pageout_state.vm_pageout_deadlock_wait; - - vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++; - break; + vm_pageout_vminfo.vm_pageout_skipped_external++; + goto want_anonymous; + } + *grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min); - case FCS_DELAYED: - clock_get_system_nanotime(&sec, &nsec); - ts.tv_sec = (unsigned int) sec; - ts.tv_nsec = nsec; +#if CONFIG_JETSAM + /* If the file-backed pool has accumulated + * significantly more pages than the jetsam + * threshold, prefer to reclaim those + * inline to minimise compute overhead of reclaiming + * anonymous pages. + * This calculation does not account for the CPU local + * external page queues, as those are expected to be + * much smaller relative to the global pools. + */ - if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) { - /* - * the pageout thread for the default pager is potentially - * deadlocked since the - * default pager queue has been throttled for more than the - * allowable time... we need to move some clean pages or dirty - * pages belonging to the external pagers if they aren't throttled - * vm_page_free_wanted represents the number of threads currently - * blocked waiting for pages... we'll move one page for each of - * these plus a fixed amount to break the logjam... once we're done - * moving this number of pages, we'll re-enter the FSC_DELAYED state - * with a new timeout target since we have no way of knowing - * whether we've broken the deadlock except through observation - * of the queue associated with the default pager... we need to - * stop moving pages and allow the system to run to see what - * state it settles into. - */ - vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief + - vm_page_free_wanted + vm_page_free_wanted_privileged; - VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1); - flow_control.state = FCS_DEADLOCK_DETECTED; - thread_wakeup((event_t) &vm_pageout_garbage_collect); - goto consider_inactive; - } - /* - * just resniff instead of trying - * to compute a new delay time... 
we're going to be - * awakened immediately upon a laundry completion, - * so we won't wait any longer than necessary - */ - msecs = vm_pageout_state.vm_pageout_idle_wait; - break; + struct vm_pageout_queue *eq = &vm_pageout_queue_external; - case FCS_DEADLOCK_DETECTED: - if (vm_pageout_deadlock_target) { - goto consider_inactive; - } - goto reset_deadlock_timer; + if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) { + if (vm_page_pageable_external_count > + vm_pageout_state.vm_page_filecache_min) { + if ((vm_page_pageable_external_count * + vm_pageout_memorystatus_fb_factor_dr) > + (memorystatus_available_pages_critical * + vm_pageout_memorystatus_fb_factor_nr)) { + *grab_anonymous = FALSE; + + VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1); } -vm_pageout_scan_delay: - vm_pageout_scan_wants_object = VM_OBJECT_NULL; + } + if (*grab_anonymous) { + VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1); + } + } +#endif /* CONFIG_JETSAM */ - vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed, - VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER); +want_anonymous: + if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) { + if (!vm_page_queue_empty(&vm_page_queue_inactive)) { + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); - if (vm_page_free_count >= vm_page_free_target) { - /* - * we're here because - * 1) someone else freed up some pages while we had - * the queues unlocked above - * and we've hit one of the 3 conditions that - * cause us to pause the pageout scan thread - * - * since we already have enough free pages, - * let's avoid stalling and return normally - * - * before we return, make sure the pageout I/O threads - * are running throttled in case there are still requests - * in the laundry... since we have enough free pages - * we don't need the laundry to be cleaned in a timely - * fashion... so let's avoid interfering with foreground - * activity - * - * we don't want to hold vm_page_queue_free_lock when - * calling vm_pageout_adjust_eq_iothrottle (since it - * may cause other locks to be taken), we do the intitial - * check outside of the lock. Once we take the lock, - * we recheck the condition since it may have changed. - * if it has, no problem, we will make the threads - * non-throttled before actually blocking - */ - vm_pageout_adjust_eq_iothrottle(eq, TRUE); - } - lck_mtx_lock(&vm_page_queue_free_lock); + assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q); + *anons_grabbed = 0; - if (vm_page_free_count >= vm_page_free_target && - (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) { - goto return_from_scan; - } - lck_mtx_unlock(&vm_page_queue_free_lock); + if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) { + if (!vm_page_queue_empty(&vm_page_queue_anonymous)) { + if ((++(*reactivated_this_call) % 100)) { + vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++; - if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) { - /* - * we're most likely about to block due to one of - * the 3 conditions that cause vm_pageout_scan to - * not be able to make forward progress w/r - * to providing new pages to the free queue, - * so unthrottle the I/O threads in case we - * have laundry to be cleaned... it needs - * to be completed ASAP. 
- * - * even if we don't block, we want the io threads - * running unthrottled since the sum of free + - * clean pages is still under our free target - */ - vm_pageout_adjust_eq_iothrottle(eq, FALSE); - } - if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) { - /* - * if we get here we're below our free target and - * we're stalling due to a full laundry queue or - * we don't have any inactive pages other then - * those in the clean queue... - * however, we have pages on the clean queue that - * can be moved to the free queue, so let's not - * stall the pageout scan - */ - flow_control.state = FCS_IDLE; - goto consider_inactive; - } - if (flow_control.state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) { - flow_control.state = FCS_IDLE; - goto consider_inactive; - } + vm_page_activate(m); + counter_inc(&vm_statistics_reactivations); +#if CONFIG_BACKGROUND_QUEUE +#if DEVELOPMENT || DEBUG + if (*is_page_from_bg_q == TRUE) { + if (m_object->internal) { + vm_pageout_rejected_bq_internal++; + } else { + vm_pageout_rejected_bq_external++; + } + } +#endif /* DEVELOPMENT || DEBUG */ +#endif /* CONFIG_BACKGROUND_QUEUE */ + vm_pageout_state.vm_pageout_inactive_used++; - VM_CHECK_MEMORYSTATUS; + m = NULL; + retval = VM_PAGEOUT_SCAN_NEXT_ITERATION; + + goto found_page; + } - if (flow_control.state != FCS_IDLE) { - VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1); + /* + * steal 1 of the file backed pages even if + * we are under the limit that has been set + * for a healthy filecache + */ + } } + goto found_page; + } + } + if (!vm_page_queue_empty(&vm_page_queue_anonymous)) { + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); - iq->pgo_throttled = TRUE; - assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC); + assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q); + *anons_grabbed += 1; - counter(c_vm_pageout_scan_block++); + goto found_page; + } - vm_page_unlock_queues(); + m = NULL; + +found_page: + *victim_page = m; + + return retval; +} - assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL); +/* + * This function is called only from vm_pageout_scan and + * it will put a page back on the active/inactive queue + * if we can't reclaim it for some reason. + */ +static void +vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q) +{ + if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) { + vm_page_enqueue_inactive(m, FALSE); + } else { + vm_page_activate(m); + } + +#if CONFIG_BACKGROUND_QUEUE +#if DEVELOPMENT || DEBUG + vm_object_t m_object = VM_PAGE_OBJECT(m); + + if (page_from_bg_q == TRUE) { + if (m_object->internal) { + vm_pageout_rejected_bq_internal++; + } else { + vm_pageout_rejected_bq_external++; + } + } +#endif /* DEVELOPMENT || DEBUG */ +#endif /* CONFIG_BACKGROUND_QUEUE */ +} + +/* + * This function is called only from vm_pageout_scan and + * it will try to grab the victim page's VM object (m_object) + * which differs from the previous victim page's object (object). + */ +static int +vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q) +{ + struct vm_speculative_age_q *sq; + + sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; + + /* + * the object associated with candidate page is + * different from the one we were just working + * with... 
dump the lock if we still own it + */ + if (*object != NULL) { + vm_object_unlock(*object); + *object = NULL; + } + /* + * Try to lock object; since we've alread got the + * page queues lock, we can only 'try' for this one. + * if the 'try' fails, we need to do a mutex_pause + * to allow the owner of the object lock a chance to + * run... otherwise, we're likely to trip over this + * object in the same state as we work our way through + * the queue... clumps of pages associated with the same + * object are fairly typical on the inactive and active queues + */ + if (!vm_object_lock_try_scan(m_object)) { + vm_page_t m_want = NULL; + + vm_pageout_vminfo.vm_pageout_inactive_nolock++; + + if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { + VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1); + } + + pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m)); + + m->vmp_reference = FALSE; + + if (!m_object->object_is_shared_cache) { + /* + * don't apply this optimization if this is the shared cache + * object, it's too easy to get rid of very hot and important + * pages... + * m->vmp_object must be stable since we hold the page queues lock... + * we can update the scan_collisions field sans the object lock + * since it is a separate field and this is the only spot that does + * a read-modify-write operation and it is never executed concurrently... + * we can asynchronously set this field to 0 when creating a UPL, so it + * is possible for the value to be a bit non-determistic, but that's ok + * since it's only used as a hint + */ + m_object->scan_collisions = 1; + } + if (!vm_page_queue_empty(&vm_page_queue_cleaned)) { + m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned); + } else if (!vm_page_queue_empty(&sq->age_q)) { + m_want = (vm_page_t) vm_page_queue_first(&sq->age_q); + } else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) && + !vm_page_queue_empty(&vm_page_queue_inactive)) { + m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); + } else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) { + m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); + } + + /* + * this is the next object we're going to be interested in + * try to make sure its available after the mutex_pause + * returns control + */ + if (m_want) { + vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want); + } + + vps_requeue_page(m, page_prev_q_state, page_from_bg_q); + + return VM_PAGEOUT_SCAN_NEXT_ITERATION; + } else { + *object = m_object; + vm_pageout_scan_wants_object = VM_OBJECT_NULL; + } + + return VM_PAGEOUT_SCAN_PROCEED; +} + +/* + * This function is called only from vm_pageout_scan and + * it notices that pageout scan may be rendered ineffective + * due to a FS deadlock and will jetsam a process if possible. + * If jetsam isn't supported, it'll move the page to the active + * queue to try and get some different pages pushed onwards so + * we can try to get out of this scenario. + */ +static void +vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit, + int *delayed_unlock, boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q) +{ + struct vm_pageout_queue *eq; + vm_object_t cur_object = VM_OBJECT_NULL; + + cur_object = *object; + + eq = &vm_pageout_queue_external; + + if (cur_object->internal == FALSE) { + /* + * we need to break up the following potential deadlock case... + * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. 
written. + * b) The thread doing the writing is waiting for pages while holding the truncate lock + * c) Most of the pages in the inactive queue belong to this file. + * + * we are potentially in this deadlock because... + * a) the external pageout queue is throttled + * b) we're done with the active queue and moved on to the inactive queue + * c) we've got a dirty external page + * + * since we don't know the reason for the external pageout queue being throttled we + * must suspect that we are deadlocked, so move the current page onto the active queue + * in an effort to cause a page from the active queue to 'age' to the inactive queue + * + * if we don't have jetsam configured (i.e. we have a dynamic pager), set + * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous + * pool the next time we select a victim page... if we can make enough new free pages, + * the deadlock will break, the external pageout queue will empty and it will no longer + * be throttled + * + * if we have jetsam configured, keep a count of the pages reactivated this way so + * that we can try to find clean pages in the active/inactive queues before + * deciding to jetsam a process + */ + vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++; + + vm_page_check_pageable_safe(m); + assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); + vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq); + m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q; + vm_page_active_count++; + vm_page_pageable_external_count++; + + vm_pageout_adjust_eq_iothrottle(eq, FALSE); + +#if CONFIG_MEMORYSTATUS && CONFIG_JETSAM + +#pragma unused(force_anonymous) + + *vm_pageout_inactive_external_forced_reactivate_limit -= 1; + + if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) { + *vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count; + /* + * Possible deadlock scenario so request jetsam action + */ + + assert(cur_object); + vm_object_unlock(cur_object); + + cur_object = VM_OBJECT_NULL; + + /* + * VM pageout scan needs to know we have dropped this lock and so set the + * object variable we got passed in to NULL. + */ + *object = VM_OBJECT_NULL; + + vm_page_unlock_queues(); + + VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START, + vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count); + + /* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. 
*/ + if (memorystatus_kill_on_VM_page_shortage(FALSE) == TRUE) { + VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1); + } + + VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END, + vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count); + + vm_page_lock_queues(); + *delayed_unlock = 1; + } +#else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */ + +#pragma unused(vm_pageout_inactive_external_forced_reactivate_limit) +#pragma unused(delayed_unlock) + + *force_anonymous = TRUE; +#endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */ + } else { + vm_page_activate(m); + counter_inc(&vm_statistics_reactivations); + +#if CONFIG_BACKGROUND_QUEUE +#if DEVELOPMENT || DEBUG + if (is_page_from_bg_q == TRUE) { + if (cur_object->internal) { + vm_pageout_rejected_bq_internal++; + } else { + vm_pageout_rejected_bq_external++; + } + } +#endif /* DEVELOPMENT || DEBUG */ +#endif /* CONFIG_BACKGROUND_QUEUE */ + + vm_pageout_state.vm_pageout_inactive_used++; + } +} + + +void +vm_page_balance_inactive(int max_to_move) +{ + vm_page_t m; + + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + + if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) { + /* + * It is likely that the hibernation code path is + * dealing with these very queues as we are about + * to move pages around in/from them and completely + * change the linkage of the pages. + * + * And so we skip the rebalancing of these queues. + */ + return; + } + vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count + + vm_page_inactive_count + + vm_page_speculative_count); + + while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) { + VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1); + + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active); + + assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q); + assert(!m->vmp_laundry); + assert(VM_PAGE_OBJECT(m) != kernel_object); + assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); + + DTRACE_VM2(scan, int, 1, (uint64_t *), NULL); + + /* + * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise... + * + * a TLB flush isn't really needed here since at worst we'll miss the reference bit being + * updated in the PTE if a remote processor still has this mapping cached in its TLB when the + * new reference happens. If no futher references happen on the page after that remote TLB flushes + * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue + * by pageout_scan, which is just fine since the last reference would have happened quite far + * in the past (TLB caches don't hang around for very long), and of course could just as easily + * have happened before we moved the page + */ + if (m->vmp_pmapped == TRUE) { + pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL); + } + + /* + * The page might be absent or busy, + * but vm_page_deactivate can handle that. + * FALSE indicates that we don't want a H/W clear reference + */ + vm_page_deactivate_internal(m, FALSE); + } +} + + +/* + * vm_pageout_scan does the dirty work for the pageout daemon. + * It returns with both vm_page_queue_free_lock and vm_page_queue_lock + * held and vm_page_free_wanted == 0. 
+ */ +void +vm_pageout_scan(void) +{ + unsigned int loop_count = 0; + unsigned int inactive_burst_count = 0; + unsigned int reactivated_this_call; + unsigned int reactivate_limit; + vm_page_t local_freeq = NULL; + int local_freed = 0; + int delayed_unlock; + int delayed_unlock_limit = 0; + int refmod_state = 0; + int vm_pageout_deadlock_target = 0; + struct vm_pageout_queue *iq; + struct vm_pageout_queue *eq; + struct vm_speculative_age_q *sq; + struct flow_control flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } }; + boolean_t inactive_throttled = FALSE; + vm_object_t object = NULL; + uint32_t inactive_reclaim_run; + boolean_t grab_anonymous = FALSE; + boolean_t force_anonymous = FALSE; + boolean_t force_speculative_aging = FALSE; + int anons_grabbed = 0; + int page_prev_q_state = 0; + boolean_t page_from_bg_q = FALSE; + uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0; + vm_object_t m_object = VM_OBJECT_NULL; + int retval = 0; + boolean_t lock_yield_check = FALSE; + + + VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START, + vm_pageout_vminfo.vm_pageout_freed_speculative, + vm_pageout_state.vm_pageout_inactive_clean, + vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, + vm_pageout_vminfo.vm_pageout_inactive_dirty_external); + + flow_control.state = FCS_IDLE; + iq = &vm_pageout_queue_internal; + eq = &vm_pageout_queue_external; + sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; + + /* Ask the pmap layer to return any pages it no longer needs. */ + uint64_t pmap_wired_pages_freed = pmap_release_pages_fast(); + + vm_page_lock_queues(); + + vm_page_wire_count -= pmap_wired_pages_freed; + + delayed_unlock = 1; + + /* + * Calculate the max number of referenced pages on the inactive + * queue that we will reactivate. + */ + reactivated_this_call = 0; + reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count + + vm_page_inactive_count); + inactive_reclaim_run = 0; + + vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count; + + /* + * We must limit the rate at which we send pages to the pagers + * so that we don't tie up too many pages in the I/O queues. + * We implement a throttling mechanism using the laundry count + * to limit the number of pages outstanding to the default + * and external pagers. We can bypass the throttles and look + * for clean pages if the pageout queues don't drain in a timely + * fashion since this may indicate that the pageout paths are + * stalled waiting for memory, which only we can provide. 
+ */ + + vps_init_page_targets(); + assert(object == NULL); + assert(delayed_unlock != 0); + + for (;;) { + vm_page_t m; + + DTRACE_VM2(rev, int, 1, (uint64_t *), NULL); + + if (lock_yield_check) { + lock_yield_check = FALSE; + + if (delayed_unlock++ > delayed_unlock_limit) { + int freed = local_freed; + + vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed, + VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER); + if (freed == 0) { + lck_mtx_yield(&vm_page_queue_lock); + } + } else if (vm_pageout_scan_wants_object) { + vm_page_unlock_queues(); + mutex_pause(0); + vm_page_lock_queues(); + } + } + + if (vm_upl_wait_for_pages < 0) { + vm_upl_wait_for_pages = 0; + } + + delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages; + + if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) { + delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX; + } + + vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed); + + assert(delayed_unlock); + + /* + * maintain our balance + */ + vm_page_balance_inactive(1); + + + /********************************************************************** + * above this point we're playing with the active and secluded queues + * below this point we're playing with the throttling mechanisms + * and the inactive queue + **********************************************************************/ + + if (vm_page_free_count + local_freed >= vm_page_free_target) { + vm_pageout_scan_wants_object = VM_OBJECT_NULL; + + vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed, + VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER); + /* + * make sure the pageout I/O threads are running + * throttled in case there are still requests + * in the laundry... since we have met our targets + * we don't need the laundry to be cleaned in a timely + * fashion... so let's avoid interfering with foreground + * activity + */ + vm_pageout_adjust_eq_iothrottle(eq, TRUE); + + lck_mtx_lock(&vm_page_queue_free_lock); + + if ((vm_page_free_count >= vm_page_free_target) && + (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) { + /* + * done - we have met our target *and* + * there is no one waiting for a page. + */ +return_from_scan: + assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL); + + VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE, + vm_pageout_state.vm_pageout_inactive, + vm_pageout_state.vm_pageout_inactive_used, 0, 0); + VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END, + vm_pageout_vminfo.vm_pageout_freed_speculative, + vm_pageout_state.vm_pageout_inactive_clean, + vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, + vm_pageout_vminfo.vm_pageout_inactive_dirty_external); + + return; + } + lck_mtx_unlock(&vm_page_queue_free_lock); + } + + /* + * Before anything, we check if we have any ripe volatile + * objects around. If so, try to purge the first object. + * If the purge fails, fall through to reclaim a page instead. + * If the purge succeeds, go back to the top and reevalute + * the new memory situation. + */ + retval = vps_purge_object(); + + if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) { + /* + * Success + */ + if (object != NULL) { + vm_object_unlock(object); + object = NULL; + } + + lock_yield_check = FALSE; + continue; + } + + /* + * If our 'aged' queue is empty and we have some speculative pages + * in the other queues, let's go through and see if we need to age + * them. 
+ * + * If we succeeded in aging a speculative Q or just that everything + * looks normal w.r.t queue age and queue counts, we keep going onward. + * + * If, for some reason, we seem to have a mismatch between the spec. + * page count and the page queues, we reset those variables and + * restart the loop (LD TODO: Track this better?). + */ + if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) { + retval = vps_age_speculative_queue(force_speculative_aging); + + if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) { + lock_yield_check = FALSE; + continue; + } + } + force_speculative_aging = FALSE; - VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START, - iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0); - memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START); + /* + * Check to see if we need to evict objects from the cache. + * + * Note: 'object' here doesn't have anything to do with + * the eviction part. We just need to make sure we have dropped + * any object lock we might be holding if we need to go down + * into the eviction logic. + */ + retval = vps_object_cache_evict(&object); - thread_block(THREAD_CONTINUE_NULL); + if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) { + lock_yield_check = FALSE; + continue; + } - VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END, - iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0); - memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END); - vm_page_lock_queues(); + /* + * Calculate our filecache_min that will affect the loop + * going forward. + */ + vps_calculate_filecache_min(); - iq->pgo_throttled = FALSE; + /* + * LD TODO: Use a structure to hold all state variables for a single + * vm_pageout_scan iteration and pass that structure to this function instead. + */ + retval = vps_flow_control(&flow_control, &anons_grabbed, &object, + &delayed_unlock, &local_freeq, &local_freed, + &vm_pageout_deadlock_target, inactive_burst_count); + if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) { if (loop_count >= vm_page_inactive_count) { loop_count = 0; } + inactive_burst_count = 0; - goto Restart; - /*NOTREACHED*/ - } + assert(object == NULL); + assert(delayed_unlock != 0); + lock_yield_check = FALSE; + continue; + } else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) { + goto return_from_scan; + } flow_control.state = FCS_IDLE; -consider_inactive: + vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count), vm_pageout_inactive_external_forced_reactivate_limit); loop_count++; @@ -2438,157 +3097,20 @@ consider_inactive: /* * Choose a victim. */ - while (1) { -#if CONFIG_BACKGROUND_QUEUE - page_from_bg_q = FALSE; -#endif /* CONFIG_BACKGROUND_QUEUE */ - - m = NULL; - m_object = VM_OBJECT_NULL; - - if (VM_DYNAMIC_PAGING_ENABLED()) { - assert(vm_page_throttled_count == 0); - assert(vm_page_queue_empty(&vm_page_queue_throttled)); - } - - /* - * Try for a clean-queue inactive page. - * These are pages that vm_pageout_scan tried to steal earlier, but - * were dirty and had to be cleaned. Pick them up now that they are clean. - */ - if (!vm_page_queue_empty(&vm_page_queue_cleaned)) { - m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned); - - assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q); - - break; - } - /* - * The next most eligible pages are ones we paged in speculatively, - * but which have not yet been touched and have been aged out. 
- */ - if (!vm_page_queue_empty(&sq->age_q)) { - m = (vm_page_t) vm_page_queue_first(&sq->age_q); - - assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q); - - if (!m->vmp_dirty || force_anonymous == FALSE) { - break; - } else { - m = NULL; - } - } - -#if CONFIG_BACKGROUND_QUEUE - if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) { - vm_object_t bg_m_object = NULL; - - m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background); - - bg_m_object = VM_PAGE_OBJECT(m); - - if (!VM_PAGE_PAGEABLE(m)) { - /* - * This page is on the background queue - * but not on a pageable queue. This is - * likely a transient state and whoever - * took it out of its pageable queue - * will likely put it back on a pageable - * queue soon but we can't deal with it - * at this point, so let's ignore this - * page. - */ - } else if (force_anonymous == FALSE || bg_m_object->internal) { - if (bg_m_object->internal && - (VM_PAGE_Q_THROTTLED(iq) || - vm_compressor_out_of_space() == TRUE || - vm_page_free_count < (vm_page_free_reserved / 4))) { - vm_pageout_skipped_bq_internal++; - } else { - page_from_bg_q = TRUE; - - if (bg_m_object->internal) { - vm_pageout_vminfo.vm_pageout_considered_bq_internal++; - } else { - vm_pageout_vminfo.vm_pageout_considered_bq_external++; - } - break; - } - } - } -#endif - inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count; - - if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) || - (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) { - grab_anonymous = TRUE; - anons_grabbed = 0; - - vm_pageout_vminfo.vm_pageout_skipped_external++; - goto want_anonymous; - } - grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min); + m = NULL; + retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call); -#if CONFIG_JETSAM - /* If the file-backed pool has accumulated - * significantly more pages than the jetsam - * threshold, prefer to reclaim those - * inline to minimise compute overhead of reclaiming - * anonymous pages. - * This calculation does not account for the CPU local - * external page queues, as those are expected to be - * much smaller relative to the global pools. 
- */ - if (grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) { - if (vm_page_pageable_external_count > - vm_pageout_state.vm_page_filecache_min) { - if ((vm_page_pageable_external_count * - vm_pageout_memorystatus_fb_factor_dr) > - (memorystatus_available_pages_critical * - vm_pageout_memorystatus_fb_factor_nr)) { - grab_anonymous = FALSE; - - VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1); - } - } - if (grab_anonymous) { - VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1); - } - } -#endif /* CONFIG_JETSAM */ + if (m == NULL) { + if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) { + inactive_burst_count = 0; -want_anonymous: - if (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) { - if (!vm_page_queue_empty(&vm_page_queue_inactive)) { - m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); - - assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q); - anons_grabbed = 0; - - if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) { - if (!vm_page_queue_empty(&vm_page_queue_anonymous)) { - if ((++reactivated_this_call % 100)) { - vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++; - goto must_activate_page; - } - /* - * steal 1% of the file backed pages even if - * we are under the limit that has been set - * for a healthy filecache - */ - } - } - break; + if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { + VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1); } - } - if (!vm_page_queue_empty(&vm_page_queue_anonymous)) { - m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); - assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q); - anons_grabbed++; - - break; + lock_yield_check = TRUE; + continue; } /* @@ -2603,17 +3125,20 @@ want_anonymous: VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1); if (!vm_page_queue_empty(&sq->age_q)) { - goto done_with_inactivepage; + lock_yield_check = TRUE; + continue; } if (vm_page_speculative_count) { force_speculative_aging = TRUE; - goto done_with_inactivepage; + lock_yield_check = TRUE; + continue; } panic("vm_pageout: no victim"); /* NOTREACHED */ } + assert(VM_PAGE_PAGEABLE(m)); m_object = VM_PAGE_OBJECT(m); force_anonymous = FALSE; @@ -2642,78 +3167,19 @@ want_anonymous: * already got the lock */ if (m_object != object) { + boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT); + /* - * the object associated with candidate page is - * different from the one we were just working - * with... dump the lock if we still own it - */ - if (object != NULL) { - vm_object_unlock(object); - object = NULL; - } - /* - * Try to lock object; since we've alread got the - * page queues lock, we can only 'try' for this one. - * if the 'try' fails, we need to do a mutex_pause - * to allow the owner of the object lock a chance to - * run... otherwise, we're likely to trip over this - * object in the same state as we work our way through - * the queue... clumps of pages associated with the same - * object are fairly typical on the inactive and active queues + * vps_switch_object() will always drop the 'object' lock first + * and then try to acquire the 'm_object' lock. So 'object' has to point to + * either 'm_object' or NULL. 
*/ - if (!vm_object_lock_try_scan(m_object)) { - vm_page_t m_want = NULL; - - vm_pageout_vminfo.vm_pageout_inactive_nolock++; - - if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { - VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1); - } - - pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m)); - - m->vmp_reference = FALSE; - - if (!m_object->object_is_shared_cache) { - /* - * don't apply this optimization if this is the shared cache - * object, it's too easy to get rid of very hot and important - * pages... - * m->vmp_object must be stable since we hold the page queues lock... - * we can update the scan_collisions field sans the object lock - * since it is a separate field and this is the only spot that does - * a read-modify-write operation and it is never executed concurrently... - * we can asynchronously set this field to 0 when creating a UPL, so it - * is possible for the value to be a bit non-determistic, but that's ok - * since it's only used as a hint - */ - m_object->scan_collisions = 1; - } - if (!vm_page_queue_empty(&vm_page_queue_cleaned)) { - m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned); - } else if (!vm_page_queue_empty(&sq->age_q)) { - m_want = (vm_page_t) vm_page_queue_first(&sq->age_q); - } else if ((grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT || - vm_page_queue_empty(&vm_page_queue_anonymous)) && - !vm_page_queue_empty(&vm_page_queue_inactive)) { - m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); - } else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) { - m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); - } - - /* - * this is the next object we're going to be interested in - * try to make sure its available after the mutex_pause - * returns control - */ - if (m_want) { - vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want); - } + retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q); - goto requeue_page; + if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) { + lock_yield_check = TRUE; + continue; } - object = m_object; - vm_pageout_scan_wants_object = VM_OBJECT_NULL; } assert(m_object == object); assert(VM_PAGE_OBJECT(m) == m_object); @@ -2729,24 +3195,11 @@ want_anonymous: if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1); } -requeue_page: - if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) { - vm_page_enqueue_inactive(m, FALSE); - } else { - vm_page_activate(m); - } -#if CONFIG_BACKGROUND_QUEUE -#if DEVELOPMENT || DEBUG - if (page_from_bg_q == TRUE) { - if (m_object->internal) { - vm_pageout_rejected_bq_internal++; - } else { - vm_pageout_rejected_bq_external++; - } - } -#endif -#endif - goto done_with_inactivepage; + + vps_requeue_page(m, page_prev_q_state, page_from_bg_q); + + lock_yield_check = TRUE; + continue; } /* @@ -2770,7 +3223,8 @@ requeue_page: * just leave it off the paging queues */ if (m->vmp_free_when_done || m->vmp_cleaning) { - goto done_with_inactivepage; + lock_yield_check = TRUE; + continue; } @@ -2839,7 +3293,9 @@ reclaim_page: } inactive_burst_count = 0; - goto done_with_inactivepage; + + lock_yield_check = TRUE; + continue; } if (object->copy == VM_OBJECT_NULL) { /* @@ -2915,18 +3371,15 @@ reclaim_page: /* deal with a rogue "reusable" page */ VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object); } - divisor = vm_pageout_state.vm_page_xpmapped_min_divisor; - if (divisor == 0) { + if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) { vm_pageout_state.vm_page_xpmapped_min = 
0; } else { - vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / divisor; + vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / vm_pageout_state.vm_page_xpmapped_min_divisor; } if (!m->vmp_no_cache && -#if CONFIG_BACKGROUND_QUEUE page_from_bg_q == FALSE && -#endif (m->vmp_reference || (m->vmp_xpmapped && !object->internal && (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) { /* @@ -2959,12 +3412,11 @@ reactivate_page: vm_page_deactivate(m); VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1); } else { -must_activate_page: /* * The page was/is being used, so put back on active list. */ vm_page_activate(m); - VM_STAT_INCR(reactivations); + counter_inc(&vm_statistics_reactivations); inactive_burst_count = 0; } #if CONFIG_BACKGROUND_QUEUE @@ -2976,14 +3428,16 @@ must_activate_page: vm_pageout_rejected_bq_external++; } } -#endif -#endif +#endif /* DEVELOPMENT || DEBUG */ +#endif /* CONFIG_BACKGROUND_QUEUE */ + if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1); } vm_pageout_state.vm_pageout_inactive_used++; - goto done_with_inactivepage; + lock_yield_check = TRUE; + continue; } /* * Make sure we call pmap_get_refmod() if it @@ -2998,10 +3452,6 @@ must_activate_page: } } - XPR(XPR_VM_PAGEOUT, - "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n", - object, m->vmp_offset, m, 0, 0); - /* * we've got a candidate page to steal... * @@ -3045,81 +3495,22 @@ throttle_inactive: VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1); inactive_burst_count = 0; - goto done_with_inactivepage; + + lock_yield_check = TRUE; + continue; } if (inactive_throttled == TRUE) { - if (object->internal == FALSE) { - /* - * we need to break up the following potential deadlock case... - * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written. - * b) The thread doing the writing is waiting for pages while holding the truncate lock - * c) Most of the pages in the inactive queue belong to this file. - * - * we are potentially in this deadlock because... - * a) the external pageout queue is throttled - * b) we're done with the active queue and moved on to the inactive queue - * c) we've got a dirty external page - * - * since we don't know the reason for the external pageout queue being throttled we - * must suspect that we are deadlocked, so move the current page onto the active queue - * in an effort to cause a page from the active queue to 'age' to the inactive queue - * - * if we don't have jetsam configured (i.e. we have a dynamic pager), set - * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous - * pool the next time we select a victim page... 
if we can make enough new free pages, - * the deadlock will break, the external pageout queue will empty and it will no longer - * be throttled - * - * if we have jetsam configured, keep a count of the pages reactivated this way so - * that we can try to find clean pages in the active/inactive queues before - * deciding to jetsam a process - */ - vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++; - - vm_page_check_pageable_safe(m); - assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); - vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq); - m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q; - vm_page_active_count++; - vm_page_pageable_external_count++; - - vm_pageout_adjust_eq_iothrottle(eq, FALSE); - -#if CONFIG_MEMORYSTATUS && CONFIG_JETSAM - vm_pageout_inactive_external_forced_reactivate_limit--; - - if (vm_pageout_inactive_external_forced_reactivate_limit <= 0) { - vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count; - /* - * Possible deadlock scenario so request jetsam action - */ - assert(object); - vm_object_unlock(object); - object = VM_OBJECT_NULL; - vm_page_unlock_queues(); + vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit, + &delayed_unlock, &force_anonymous, page_from_bg_q); - VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START, - vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count); - - /* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. */ - if (memorystatus_kill_on_VM_page_shortage(FALSE) == TRUE) { - VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1); - } - - VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END, - vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count); + inactive_burst_count = 0; - vm_page_lock_queues(); - delayed_unlock = 1; - } -#else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */ - force_anonymous = TRUE; -#endif - inactive_burst_count = 0; - goto done_with_inactivepage; - } else { - goto must_activate_page; + if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { + VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1); } + + lock_yield_check = TRUE; + continue; } /* @@ -3261,21 +3652,6 @@ throttle_inactive: vm_pageout_cluster(m); inactive_burst_count = 0; -done_with_inactivepage: - - if (delayed_unlock++ > delayed_unlock_limit) { - int freed = local_freed; - - vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed, - VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER); - if (freed == 0) { - lck_mtx_yield(&vm_page_queue_lock); - } - } else if (vm_pageout_scan_wants_object) { - vm_page_unlock_queues(); - mutex_pause(0); - vm_page_lock_queues(); - } /* * back to top of pageout scan loop */ @@ -3335,11 +3711,9 @@ vm_pageout_continue(void) DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL); VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1); -#if !CONFIG_EMBEDDED lck_mtx_lock(&vm_page_queue_free_lock); vm_pageout_running = TRUE; lck_mtx_unlock(&vm_page_queue_free_lock); -#endif /* CONFIG_EMBEDDED */ vm_pageout_scan(); /* @@ -3350,23 +3724,22 @@ vm_pageout_continue(void) assert(vm_page_free_wanted_privileged == 0); assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT); -#if !CONFIG_EMBEDDED vm_pageout_running = FALSE; +#if XNU_TARGET_OS_OSX if (vm_pageout_waiter) { vm_pageout_waiter = FALSE; thread_wakeup((event_t)&vm_pageout_waiter); } -#endif /* 
!CONFIG_EMBEDDED */ +#endif /* XNU_TARGET_OS_OSX */ lck_mtx_unlock(&vm_page_queue_free_lock); vm_page_unlock_queues(); - counter(c_vm_pageout_block++); thread_block((thread_continue_t)vm_pageout_continue); /*NOTREACHED*/ } -#if !CONFIG_EMBEDDED +#if XNU_TARGET_OS_OSX kern_return_t vm_pageout_wait(uint64_t deadline) { @@ -3385,7 +3758,7 @@ vm_pageout_wait(uint64_t deadline) return kr; } -#endif /* !CONFIG_EMBEDDED */ +#endif /* XNU_TARGET_OS_OSX */ static void @@ -3559,7 +3932,16 @@ vm_pageout_iothread_internal_continue(struct cq *cq) KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0); q = cq->q; +#if __AMP__ + if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) { + local_batch_size = (q->pgo_maxlaundry >> 3); + local_batch_size = MAX(local_batch_size, 16); + } else { + local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2); + } +#else local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2); +#endif #if RECORD_THE_COMPRESSED_DATA if (q->pgo_laundry) { @@ -3847,7 +4229,7 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m) vm_object_owner_compressed_update(object, +1); } - VM_STAT_INCR(compressions); + counter_inc(&vm_statistics_compressions); if (m->vmp_tabled) { vm_page_remove(m, TRUE); @@ -3891,9 +4273,8 @@ vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *eq, boolean_t req_lowpr proc_set_thread_policy_with_tid(kernel_task, eq->pgo_tid, TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy); - eq->pgo_lowpriority = req_lowpriority; - vm_page_lock_queues(); + eq->pgo_lowpriority = req_lowpriority; } } @@ -3943,6 +4324,19 @@ vm_pageout_iothread_internal(struct cq *cq) thread_vm_bind_group_add(); } +#if CONFIG_THREAD_GROUPS + thread_group_vm_add(); +#endif /* CONFIG_THREAD_GROUPS */ + +#if __AMP__ + if (vm_compressor_ebound) { + /* + * Use the soft bound option for vm_compressor to allow it to run on + * P-cores if E-cluster is unavailable. + */ + thread_bind_cluster_type(self, 'E', true); + } +#endif /* __AMP__ */ thread_set_thread_name(current_thread(), "VM_compressor"); #if DEVELOPMENT || DEBUG @@ -3956,7 +4350,7 @@ vm_pageout_iothread_internal(struct cq *cq) kern_return_t vm_set_buffer_cleanup_callout(boolean_t (*func)(int)) { - if (OSCompareAndSwapPtr(NULL, func, (void * volatile *) &consider_buffer_cache_collect)) { + if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) { return KERN_SUCCESS; } else { return KERN_FAILURE; /* Already set */ @@ -3983,16 +4377,16 @@ vm_pressure_response(void) return; } -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX available_memory = (uint64_t) memorystatus_available_pages; -#else /* CONFIG_EMBEDDED */ +#else /* !XNU_TARGET_OS_OSX */ available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY; memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY; -#endif /* CONFIG_EMBEDDED */ +#endif /* !XNU_TARGET_OS_OSX */ total_pages = (unsigned int) atop_64(max_mem); #if CONFIG_SECLUDED_MEMORY @@ -4063,53 +4457,67 @@ vm_pressure_response(void) } #endif /* VM_PRESSURE_EVENTS */ +/* + * Function called by a kernel thread to either get the current pressure level or + * wait until memory pressure changes from a given level. 
+ */ kern_return_t mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level) { -#if CONFIG_EMBEDDED - - return KERN_FAILURE; - -#elif !VM_PRESSURE_EVENTS +#if !VM_PRESSURE_EVENTS return KERN_FAILURE; #else /* VM_PRESSURE_EVENTS */ - kern_return_t kr = KERN_SUCCESS; + wait_result_t wr = 0; + vm_pressure_level_t old_level = memorystatus_vm_pressure_level; - if (pressure_level != NULL) { - vm_pressure_level_t old_level = memorystatus_vm_pressure_level; + if (pressure_level == NULL) { + return KERN_INVALID_ARGUMENT; + } - if (wait_for_pressure == TRUE) { - wait_result_t wr = 0; + if (*pressure_level == kVMPressureJetsam) { + if (!wait_for_pressure) { + return KERN_INVALID_ARGUMENT; + } - while (old_level == *pressure_level) { - wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed, - THREAD_INTERRUPTIBLE); - if (wr == THREAD_WAITING) { - wr = thread_block(THREAD_CONTINUE_NULL); - } - if (wr == THREAD_INTERRUPTED) { - return KERN_ABORTED; - } - if (wr == THREAD_AWAKENED) { - old_level = memorystatus_vm_pressure_level; + lck_mtx_lock(&memorystatus_jetsam_fg_band_lock); + wr = assert_wait((event_t)&memorystatus_jetsam_fg_band_waiters, + THREAD_INTERRUPTIBLE); + if (wr == THREAD_WAITING) { + ++memorystatus_jetsam_fg_band_waiters; + lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock); + wr = thread_block(THREAD_CONTINUE_NULL); + } else { + lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock); + } + if (wr != THREAD_AWAKENED) { + return KERN_ABORTED; + } + *pressure_level = kVMPressureJetsam; + return KERN_SUCCESS; + } - if (old_level != *pressure_level) { - break; - } - } + if (wait_for_pressure == TRUE) { + while (old_level == *pressure_level) { + wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed, + THREAD_INTERRUPTIBLE); + if (wr == THREAD_WAITING) { + wr = thread_block(THREAD_CONTINUE_NULL); + } + if (wr == THREAD_INTERRUPTED) { + return KERN_ABORTED; } - } - *pressure_level = old_level; - kr = KERN_SUCCESS; - } else { - kr = KERN_INVALID_ARGUMENT; + if (wr == THREAD_AWAKENED) { + old_level = memorystatus_vm_pressure_level; + } + } } - return kr; + *pressure_level = old_level; + return KERN_SUCCESS; #endif /* VM_PRESSURE_EVENTS */ } @@ -4163,17 +4571,15 @@ compute_pageout_gc_throttle(__unused void *arg) * * 2. The jetsam path might need to allocate zone memory itself. We could try * using the non-blocking variant of zalloc for this path, but we can still - * end up trying to do a kernel_memory_allocate when the zone_map is almost + * end up trying to do a kernel_memory_allocate when the zone maps are almost * full. */ -extern boolean_t is_zone_map_nearing_exhaustion(void); - void vm_pageout_garbage_collect(int collect) { if (collect) { - if (is_zone_map_nearing_exhaustion()) { + if (zone_map_nearing_exhaustion()) { /* * Woken up by the zone allocator for zone-map-exhaustion jetsams. * @@ -4191,7 +4597,7 @@ vm_pageout_garbage_collect(int collect) * ok; if memory pressure persists, the thread will simply be woken * up again. */ - consider_zone_gc(TRUE); + zone_gc(ZONE_GC_JETSAM); } else { /* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */ boolean_t buf_large_zfree = FALSE; @@ -4208,10 +4614,10 @@ vm_pageout_garbage_collect(int collect) } if (first_try == TRUE || buf_large_zfree == TRUE) { /* - * consider_zone_gc should be last, because the other operations + * zone_gc should be last, because the other operations * might return memory to zones. 
*/ - consider_zone_gc(FALSE); + zone_gc(ZONE_GC_TRIM); } first_try = FALSE; } while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target); @@ -4236,36 +4642,37 @@ extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end; void -vm_set_restrictions() +vm_set_restrictions(unsigned int num_cpus) { - host_basic_info_data_t hinfo; - mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; - -#define BSD_HOST 1 - host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count); + int vm_restricted_to_single_processor = 0; - assert(hinfo.max_cpus > 0); - - if (hinfo.max_cpus <= 3) { - /* - * on systems with a limited number of CPUS, bind the - * 4 major threads that can free memory and that tend to use - * a fair bit of CPU under pressured conditions to a single processor. - * This insures that these threads don't hog all of the available CPUs - * (important for camera launch), while allowing them to run independently - * w/r to locks... the 4 threads are - * vm_pageout_scan, vm_pageout_iothread_internal (compressor), - * vm_compressor_swap_trigger_thread (minor and major compactions), - * memorystatus_thread (jetsams). - * - * the first time the thread is run, it is responsible for checking the - * state of vm_restricted_to_single_processor, and if TRUE it calls - * thread_bind_master... someday this should be replaced with a group - * scheduling mechanism and KPI. - */ - vm_pageout_state.vm_restricted_to_single_processor = TRUE; + if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) { + kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor); + vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE); } else { - vm_pageout_state.vm_restricted_to_single_processor = FALSE; + assert(num_cpus > 0); + + if (num_cpus <= 3) { + /* + * on systems with a limited number of CPUS, bind the + * 4 major threads that can free memory and that tend to use + * a fair bit of CPU under pressured conditions to a single processor. + * This insures that these threads don't hog all of the available CPUs + * (important for camera launch), while allowing them to run independently + * w/r to locks... the 4 threads are + * vm_pageout_scan, vm_pageout_iothread_internal (compressor), + * vm_compressor_swap_trigger_thread (minor and major compactions), + * memorystatus_thread (jetsams). + * + * the first time the thread is run, it is responsible for checking the + * state of vm_restricted_to_single_processor, and if TRUE it calls + * thread_bind_master... someday this should be replaced with a group + * scheduling mechanism and KPI. + */ + vm_pageout_state.vm_restricted_to_single_processor = TRUE; + } else { + vm_pageout_state.vm_restricted_to_single_processor = FALSE; + } } } @@ -4282,18 +4689,65 @@ vm_pageout(void) */ s = splsched(); + vm_pageout_scan_thread = self; + +#if CONFIG_VPS_DYNAMIC_PRIO + + int vps_dynprio_bootarg = 0; + + if (PE_parse_boot_argn("vps_dynamic_priority_enabled", &vps_dynprio_bootarg, sizeof(vps_dynprio_bootarg))) { + vps_dynamic_priority_enabled = (vps_dynprio_bootarg ? 
TRUE : FALSE); + kprintf("Overriding vps_dynamic_priority_enabled to %d\n", vps_dynamic_priority_enabled); + } else { + if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) { + vps_dynamic_priority_enabled = TRUE; + } else { + vps_dynamic_priority_enabled = FALSE; + } + } + + if (vps_dynamic_priority_enabled) { + sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE); + thread_set_eager_preempt(self); + } else { + sched_set_kernel_thread_priority(self, BASEPRI_VM); + } + +#else /* CONFIG_VPS_DYNAMIC_PRIO */ + + vps_dynamic_priority_enabled = FALSE; + sched_set_kernel_thread_priority(self, BASEPRI_VM); + +#endif /* CONFIG_VPS_DYNAMIC_PRIO */ + thread_lock(self); self->options |= TH_OPT_VMPRIV; - sched_set_thread_base_priority(self, BASEPRI_VM); thread_unlock(self); if (!self->reserved_stack) { self->reserved_stack = self->kernel_stack; } - if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) { - thread_vm_bind_group_add(); + if (vm_pageout_state.vm_restricted_to_single_processor == TRUE && + vps_dynamic_priority_enabled == FALSE) { + thread_vm_bind_group_add(); + } + + +#if CONFIG_THREAD_GROUPS + thread_group_vm_add(); +#endif /* CONFIG_THREAD_GROUPS */ + +#if __AMP__ + PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound)); + if (vm_pgo_pbound) { + /* + * Use the soft bound option for vm pageout to allow it to run on + * E-cores if P-cluster is unavailable. + */ + thread_bind_cluster_type(self, 'P', true); } +#endif /* __AMP__ */ splx(s); @@ -4412,15 +4866,24 @@ vm_pageout(void) if (result != KERN_SUCCESS) { panic("vm_pageout_iothread_external: create failed"); } - + thread_set_thread_name(vm_pageout_state.vm_pageout_external_iothread, "VM_pageout_external_iothread"); thread_deallocate(vm_pageout_state.vm_pageout_external_iothread); - result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL, + result = kernel_thread_create((thread_continue_t)vm_pageout_garbage_collect, NULL, BASEPRI_DEFAULT, &thread); if (result != KERN_SUCCESS) { panic("vm_pageout_garbage_collect: create failed"); } + thread_set_thread_name(thread, "VM_pageout_garbage_collect"); + if (thread->reserved_stack == 0) { + assert(thread->kernel_stack); + thread->reserved_stack = thread->kernel_stack; + } + + thread_mtx_lock(thread); + thread_start(thread); + thread_mtx_unlock(thread); thread_deallocate(thread); @@ -4444,6 +4907,7 @@ vm_pageout(void) switch (vm_compressor_mode) { case VM_PAGER_DEFAULT: printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n"); + OS_FALLTHROUGH; case VM_PAGER_COMPRESSOR_WITH_SWAP: vm_config.compressor_is_present = TRUE; @@ -4460,6 +4924,7 @@ vm_pageout(void) case VM_PAGER_FREEZER_DEFAULT: printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n"); + OS_FALLTHROUGH; case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP: vm_config.compressor_is_present = TRUE; @@ -4508,8 +4973,6 @@ vm_pageout(void) vm_object_tracking_init(); #endif /* VM_OBJECT_TRACKING */ - vm_tests(); - vm_pageout_continue(); /* @@ -4542,8 +5005,8 @@ kern_return_t vm_pageout_internal_start(void) { kern_return_t result; - int i; host_basic_info_data_t hinfo; + vm_offset_t buf, bufsize; assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); @@ -4553,20 +5016,24 @@ vm_pageout_internal_start(void) assert(hinfo.max_cpus > 0); - lck_grp_init(&vm_pageout_lck_grp, "vm_pageout", LCK_GRP_ATTR_NULL); - -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX vm_pageout_state.vm_compressor_thread_count = 1; -#else +#else /* !XNU_TARGET_OS_OSX */ 
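/*
 * [Editor's note: illustrative sketch only -- this block is an annotation
 * and is not part of the upstream xnu diff.]
 *
 * The surrounding hunk in vm_pageout_internal_start() decides how many
 * internal compressor I/O threads to create: one on non-macOS targets,
 * two on macOS when hinfo.max_cpus > 4, with the "vmcomp_threads"
 * boot-arg as an override, a forced count of two when the compressor is
 * bound to the E-cluster (__AMP__ / "vmcomp_ecluster"), and a final
 * clamp against hinfo.max_cpus and MAX_COMPRESSOR_THREAD_COUNT before
 * the internal queue depth is sized from the result.
 *
 * A minimal standalone sketch of that selection logic follows. The
 * helper name and its parameters are hypothetical stand-ins for
 * hinfo.max_cpus, the two boot-args and vm_compressor_ebound; the exact
 * floor/ceiling checks are paraphrased from the surrounding context:
 *
 *   static int
 *   compressor_thread_count_sketch(int max_cpus, bool is_osx,
 *       int bootarg_threads, bool ecluster_bound)
 *   {
 *       // platform default: one thread, or two on larger macOS systems
 *       int count = is_osx ? ((max_cpus > 4) ? 2 : 1) : 1;
 *
 *       if (bootarg_threads > 0) {      // "vmcomp_threads" boot-arg
 *           count = bootarg_threads;
 *       }
 *       if (ecluster_bound) {           // "vmcomp_ecluster" on AMP systems
 *           count = 2;
 *       }
 *       if (count >= max_cpus) {        // leave at least one CPU for others
 *           count = max_cpus - 1;
 *       }
 *       if (count < 1) {                // never drop below a single thread
 *           count = 1;
 *       }
 *       if (count > MAX_COMPRESSOR_THREAD_COUNT) {
 *           count = MAX_COMPRESSOR_THREAD_COUNT;
 *       }
 *       return count;
 *   }
 *
 * The chosen count then drives the internal laundry depth, i.e.
 * pgo_maxlaundry = (count * 4) * VM_PAGE_LAUNDRY_MAX, which remains
 * overridable via the "vmpgoi_maxlaundry" boot-arg.
 */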
if (hinfo.max_cpus > 4) { vm_pageout_state.vm_compressor_thread_count = 2; } else { vm_pageout_state.vm_compressor_thread_count = 1; } -#endif +#endif /* !XNU_TARGET_OS_OSX */ PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count, sizeof(vm_pageout_state.vm_compressor_thread_count)); +#if __AMP__ + PE_parse_boot_argn("vmcomp_ecluster", &vm_compressor_ebound, sizeof(vm_compressor_ebound)); + if (vm_compressor_ebound) { + vm_pageout_state.vm_compressor_thread_count = 2; + } +#endif if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) { vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1; } @@ -4576,18 +5043,30 @@ vm_pageout_internal_start(void) vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT; } - vm_pageout_queue_internal.pgo_maxlaundry = (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX; + vm_pageout_queue_internal.pgo_maxlaundry = + (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX; + + PE_parse_boot_argn("vmpgoi_maxlaundry", + &vm_pageout_queue_internal.pgo_maxlaundry, + sizeof(vm_pageout_queue_internal.pgo_maxlaundry)); - PE_parse_boot_argn("vmpgoi_maxlaundry", &vm_pageout_queue_internal.pgo_maxlaundry, sizeof(vm_pageout_queue_internal.pgo_maxlaundry)); + bufsize = COMPRESSOR_SCRATCH_BUF_SIZE; + if (kernel_memory_allocate(kernel_map, &buf, + bufsize * vm_pageout_state.vm_compressor_thread_count, + 0, KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_COMPRESSOR)) { + panic("vm_pageout_internal_start: Unable to allocate %zd bytes", + (size_t)(bufsize * vm_pageout_state.vm_compressor_thread_count)); + } - for (i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) { + for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) { ciq[i].id = i; ciq[i].q = &vm_pageout_queue_internal; ciq[i].current_chead = NULL; - ciq[i].scratch_buf = kalloc(COMPRESSOR_SCRATCH_BUF_SIZE); + ciq[i].scratch_buf = (char *)(buf + i * bufsize); - result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, (void *)&ciq[i], - BASEPRI_VM, &vm_pageout_state.vm_pageout_internal_iothread); + result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, + (void *)&ciq[i], BASEPRI_VM, + &vm_pageout_state.vm_pageout_internal_iothread); if (result == KERN_SUCCESS) { thread_deallocate(vm_pageout_state.vm_pageout_internal_iothread); @@ -4651,6 +5130,8 @@ upl_create(int type, int flags, upl_size_t size) int upl_flags = 0; vm_size_t upl_size = sizeof(struct upl); + assert(page_aligned(size)); + size = round_page_32(size); if (type & UPL_CREATE_LITE) { @@ -4672,7 +5153,8 @@ upl_create(int type, int flags, upl_size_t size) upl->flags = upl_flags | flags; upl->kaddr = (vm_offset_t)0; - upl->size = 0; + upl->u_offset = 0; + upl->u_size = 0; upl->map_object = NULL; upl->ref_count = 1; upl->ext_ref_count = 0; @@ -4728,6 +5210,8 @@ upl_destroy(upl_t upl) int page_field_size; /* bit field in word size buf */ int size; +// DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object); + if (upl->ext_ref_count) { panic("upl(%p) ext_ref_count", upl); } @@ -4745,7 +5229,8 @@ upl_destroy(upl_t upl) #endif /* CONFIG_IOSCHED */ #if CONFIG_IOSCHED || UPL_DEBUG - if ((upl->flags & UPL_TRACKED_BY_OBJECT) && !(upl->flags & UPL_VECTOR)) { + if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) && + !(upl->flags & UPL_VECTOR)) { vm_object_t object; if (upl->flags & UPL_SHADOWED) { @@ 
-4772,7 +5257,7 @@ upl_destroy(upl_t upl) if (upl->flags & UPL_DEVICE_MEMORY) { size = PAGE_SIZE; } else { - size = upl->size; + size = upl_adjusted_size(upl, PAGE_MASK); } page_field_size = 0; @@ -4856,6 +5341,50 @@ must_throttle_writes() return FALSE; } +#define MIN_DELAYED_WORK_CTX_ALLOCATED (16) +#define MAX_DELAYED_WORK_CTX_ALLOCATED (512) + +int vm_page_delayed_work_ctx_needed = 0; +SECURITY_READ_ONLY_LATE(zone_t) dw_ctx_zone; + +void +vm_page_delayed_work_init_ctx(void) +{ + size_t elem_size = sizeof(struct vm_page_delayed_work_ctx); + + dw_ctx_zone = zone_create_ext("delayed-work-ctx", elem_size, + ZC_NOGC, ZONE_ID_ANY, ^(zone_t z) { + zone_set_exhaustible(z, MAX_DELAYED_WORK_CTX_ALLOCATED); + }); + + zone_fill_initially(dw_ctx_zone, MIN_DELAYED_WORK_CTX_ALLOCATED); +} + +struct vm_page_delayed_work* +vm_page_delayed_work_get_ctx(void) +{ + struct vm_page_delayed_work_ctx * dw_ctx = NULL; + + dw_ctx = (struct vm_page_delayed_work_ctx*) zalloc_noblock(dw_ctx_zone); + + if (dw_ctx) { + dw_ctx->delayed_owner = current_thread(); + } else { + vm_page_delayed_work_ctx_needed++; + } + return dw_ctx ? dw_ctx->dwp : NULL; +} + +void +vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp) +{ + struct vm_page_delayed_work_ctx *ldw_ctx; + + ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp; + ldw_ctx->delayed_owner = NULL; + + zfree(dw_ctx_zone, ldw_ctx); +} /* * Routine: vm_object_upl_request @@ -4924,8 +5453,9 @@ vm_object_upl_request( int refmod_state = 0; wpl_array_t lite_list = NULL; vm_object_t last_copy_object; - struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT]; - struct vm_page_delayed_work *dwp; + struct vm_page_delayed_work dw_array; + struct vm_page_delayed_work *dwp, *dwp_start; + bool dwp_finish_ctx = TRUE; int dw_count; int dw_limit; int io_tracking_flag = 0; @@ -4938,6 +5468,8 @@ vm_object_upl_request( task_t task = current_task(); #endif /* DEVELOPMENT || DEBUG */ + dwp_start = dwp = NULL; + if (cntrl_flags & ~UPL_VALID_FLAGS) { /* * For forward compatibility's sake, @@ -4952,8 +5484,23 @@ vm_object_upl_request( panic("vm_object_upl_request: contiguous object specified\n"); } + assertf(page_aligned(offset) && page_aligned(size), + "offset 0x%llx size 0x%x", + offset, size); + VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0); + dw_count = 0; + dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); + dwp_start = vm_page_delayed_work_get_ctx(); + if (dwp_start == NULL) { + dwp_start = &dw_array; + dw_limit = 1; + dwp_finish_ctx = FALSE; + } + + dwp = dwp_start; + if (size > MAX_UPL_SIZE_BYTES) { size = MAX_UPL_SIZE_BYTES; } @@ -5025,8 +5572,11 @@ vm_object_upl_request( upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; upl->map_object->vo_shadow_offset = offset; upl->map_object->wimg_bits = object->wimg_bits; + assertf(page_aligned(upl->map_object->vo_shadow_offset), + "object %p shadow_offset 0x%llx", + upl->map_object, upl->map_object->vo_shadow_offset); - VM_PAGE_GRAB_FICTITIOUS(alias_page); + alias_page = vm_page_grab_fictitious(TRUE); upl->flags |= UPL_SHADOWED; } @@ -5047,8 +5597,8 @@ vm_object_upl_request( /* * we can lock in the paging_offset once paging_in_progress is set */ - upl->size = size; - upl->offset = offset + object->paging_offset; + upl->u_size = size; + upl->u_offset = offset + object->paging_offset; #if CONFIG_IOSCHED || UPL_DEBUG if (object->io_tracking || upl_debug_enabled) { @@ -5088,10 +5638,6 @@ vm_object_upl_request( dst_offset = offset; size_in_pages = size / 
PAGE_SIZE; - dwp = &dw_array[0]; - dw_count = 0; - dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); - if (vm_page_free_count > (vm_page_free_target + size_in_pages) || object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) { object->scan_collisions = 0; @@ -5100,11 +5646,11 @@ vm_object_upl_request( if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) { boolean_t isSSD = FALSE; -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX isSSD = TRUE; -#else +#else /* !XNU_TARGET_OS_OSX */ vnode_pager_get_isSSD(object->pager, &isSSD); -#endif +#endif /* !XNU_TARGET_OS_OSX */ vm_object_unlock(object); OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages); @@ -5124,7 +5670,7 @@ vm_object_upl_request( if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) { vm_object_unlock(object); - VM_PAGE_GRAB_FICTITIOUS(alias_page); + alias_page = vm_page_grab_fictitious(TRUE); vm_object_lock(object); } if (cntrl_flags & UPL_COPYOUT_FROM) { @@ -5267,7 +5813,7 @@ check_busy: pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE); assert(pg_num == (dst_offset - offset) / PAGE_SIZE); - lite_list[pg_num >> 5] |= 1 << (pg_num & 31); + lite_list[pg_num >> 5] |= 1U << (pg_num & 31); if (hw_dirty) { if (pmap_flushes_delayed == FALSE) { @@ -5482,7 +6028,7 @@ check_busy: dst_page->vmp_clustered = TRUE; if (!(cntrl_flags & UPL_FILE_IO)) { - VM_STAT_INCR(pageins); + counter_inc(&vm_statistics_pageins); } } } @@ -5512,7 +6058,7 @@ check_busy: pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE); assert(pg_num == (dst_offset - offset) / PAGE_SIZE); - lite_list[pg_num >> 5] |= 1 << (pg_num & 31); + lite_list[pg_num >> 5] |= 1U << (pg_num & 31); if (hw_dirty) { pmap_clear_modify(phys_page); @@ -5542,7 +6088,22 @@ check_busy: upl->flags &= ~UPL_CLEAR_DIRTY; upl->flags |= UPL_SET_DIRTY; dirty = TRUE; - upl->flags |= UPL_SET_DIRTY; + /* + * Page belonging to a code-signed object is about to + * be written. Mark it tainted and disconnect it from + * all pmaps so processes have to fault it back in and + * deal with the tainted bit. 
+ */ + if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) { + dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE; + vm_page_upl_tainted++; + if (dst_page->vmp_pmapped) { + refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page)); + if (refmod_state & VM_MEM_REFERENCED) { + dst_page->vmp_reference = TRUE; + } + } + } } else if (cntrl_flags & UPL_CLEAN_IN_PLACE) { /* * clean in place for read implies @@ -5638,15 +6199,15 @@ check_busy: try_next_page: if (dwp->dw_mask) { if (dwp->dw_mask & DW_vm_page_activate) { - VM_STAT_INCR(reactivations); + counter_inc(&vm_statistics_reactivations); } VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count); if (dw_count >= dw_limit) { - vm_page_do_delayed_work(object, tag, &dw_array[0], dw_count); + vm_page_do_delayed_work(object, tag, dwp_start, dw_count); - dwp = &dw_array[0]; + dwp = dwp_start; dw_count = 0; } } @@ -5655,7 +6216,9 @@ try_next_page: xfer_size -= PAGE_SIZE; } if (dw_count) { - vm_page_do_delayed_work(object, tag, &dw_array[0], dw_count); + vm_page_do_delayed_work(object, tag, dwp_start, dw_count); + dwp = dwp_start; + dw_count = 0; } if (alias_page != NULL) { @@ -5684,6 +6247,11 @@ try_next_page: } #endif /* DEVELOPMENT || DEBUG */ + if (dwp_start && dwp_finish_ctx) { + vm_page_delayed_work_finish_ctx(dwp_start); + dwp_start = dwp = NULL; + } + return KERN_SUCCESS; } @@ -5751,11 +6319,9 @@ vm_object_super_upl_request( return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag); } -#if CONFIG_EMBEDDED int cs_executable_create_upl = 0; extern int proc_selfpid(void); extern char *proc_name_address(void *p); -#endif /* CONFIG_EMBEDDED */ kern_return_t vm_map_create_upl( @@ -5776,8 +6342,18 @@ vm_map_create_upl( vm_map_offset_t local_offset; vm_map_offset_t local_start; kern_return_t ret; + vm_map_address_t original_offset; + vm_map_size_t original_size, adjusted_size; + vm_map_offset_t local_entry_start; + vm_object_offset_t local_entry_offset; + vm_object_offset_t offset_in_mapped_page; + boolean_t release_map = FALSE; + +start_with_map: - assert(page_aligned(offset)); + original_offset = offset; + original_size = *upl_size; + adjusted_size = original_size; caller_flags = *flags; @@ -5786,13 +6362,15 @@ vm_map_create_upl( * For forward compatibility's sake, * reject any unknown flag. 
*/ - return KERN_INVALID_VALUE; + ret = KERN_INVALID_VALUE; + goto done; } force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC); sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM); if (upl == NULL) { - return KERN_INVALID_ARGUMENT; + ret = KERN_INVALID_ARGUMENT; + goto done; } REDISCOVER_ENTRY: @@ -5800,12 +6378,22 @@ REDISCOVER_ENTRY: if (!vm_map_lookup_entry(map, offset, &entry)) { vm_map_unlock_read(map); - return KERN_FAILURE; + ret = KERN_FAILURE; + goto done; } - if ((entry->vme_end - offset) < *upl_size) { - *upl_size = (upl_size_t) (entry->vme_end - offset); - assert(*upl_size == entry->vme_end - offset); + local_entry_start = entry->vme_start; + local_entry_offset = VME_OFFSET(entry); + + if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) { + DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags); + } + + if (entry->vme_end - original_offset < adjusted_size) { + adjusted_size = entry->vme_end - original_offset; + assert(adjusted_size > 0); + *upl_size = (upl_size_t) adjusted_size; + assert(*upl_size == adjusted_size); } if (caller_flags & UPL_QUERY_OBJECT_TYPE) { @@ -5822,7 +6410,22 @@ REDISCOVER_ENTRY: } } vm_map_unlock_read(map); - return KERN_SUCCESS; + ret = KERN_SUCCESS; + goto done; + } + + offset_in_mapped_page = 0; + if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) { + offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map)); + *upl_size = (upl_size_t) + (vm_map_round_page(original_offset + adjusted_size, + VM_MAP_PAGE_MASK(map)) + - offset); + + offset_in_mapped_page = original_offset - offset; + assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map)); + + DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page); } if (VME_OBJECT(entry) == VM_OBJECT_NULL || @@ -5842,8 +6445,7 @@ REDISCOVER_ENTRY: VME_OBJECT_SET(entry, vm_object_allocate((vm_size_t) - (entry->vme_end - - entry->vme_start))); + vm_object_round_page((entry->vme_end - entry->vme_start)))); VME_OFFSET_SET(entry, 0); assert(entry->use_pmap); @@ -5854,10 +6456,11 @@ REDISCOVER_ENTRY: !entry->is_sub_map && !(entry->protection & VM_PROT_WRITE)) { vm_map_unlock_read(map); - return KERN_PROTECTION_FAILURE; + ret = KERN_PROTECTION_FAILURE; + goto done; } -#if CONFIG_EMBEDDED +#if !XNU_TARGET_OS_OSX if (map->pmap != kernel_pmap && (caller_flags & UPL_COPYOUT_FROM) && (entry->protection & VM_PROT_EXECUTE) && @@ -5879,6 +6482,7 @@ REDISCOVER_ENTRY: */ vm_map_unlock_read(map); + entry = VM_MAP_ENTRY_NULL; /* allocate kernel buffer */ ksize = round_page(*upl_size); kaddr = 0; @@ -5888,7 +6492,6 @@ REDISCOVER_ENTRY: tag); if (ret == KERN_SUCCESS) { /* copyin the user data */ - assert(page_aligned(offset)); ret = copyinmap(map, offset, (void *)kaddr, *upl_size); } if (ret == KERN_SUCCESS) { @@ -5899,8 +6502,17 @@ REDISCOVER_ENTRY: ksize - *upl_size); } /* create the UPL from the kernel buffer */ - ret = vm_map_create_upl(kernel_map, kaddr, upl_size, - upl, page_list, count, flags, tag); + vm_object_offset_t offset_in_object; + vm_object_offset_t offset_in_object_page; + + offset_in_object = offset - local_entry_start + local_entry_offset; + offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object); + assert(offset_in_object_page < PAGE_SIZE); + assert(offset_in_object_page + 
offset_in_mapped_page < PAGE_SIZE); + *upl_size -= offset_in_object_page + offset_in_mapped_page; + ret = vm_map_create_upl(kernel_map, + (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page), + upl_size, upl, page_list, count, flags, tag); } if (kaddr != 0) { /* free the kernel buffer */ @@ -5915,9 +6527,9 @@ REDISCOVER_ENTRY: upl_size_t, *upl_size, kern_return_t, ret); #endif /* DEVELOPMENT || DEBUG */ - return ret; + goto done; } -#endif /* CONFIG_EMBEDDED */ +#endif /* !XNU_TARGET_OS_OSX */ local_object = VME_OBJECT(entry); assert(local_object != VM_OBJECT_NULL); @@ -5982,6 +6594,7 @@ REDISCOVER_ENTRY: map->mapped_in_other_pmaps) ? PMAP_NULL : map->pmap), + VM_MAP_PAGE_SIZE(map), entry->vme_start, prot); @@ -6038,16 +6651,17 @@ REDISCOVER_ENTRY: &version, &object, &new_offset, &prot, &wired, NULL, - &real_map) != KERN_SUCCESS) { + &real_map, NULL) != KERN_SUCCESS) { if (fault_type == VM_PROT_WRITE) { vm_counters.create_upl_lookup_failure_write++; } else { vm_counters.create_upl_lookup_failure_copy++; } vm_map_unlock_read(local_map); - return KERN_FAILURE; + ret = KERN_FAILURE; + goto done; } - if (real_map != map) { + if (real_map != local_map) { vm_map_unlock(real_map); } vm_map_unlock_read(local_map); @@ -6062,17 +6676,22 @@ REDISCOVER_ENTRY: submap = VME_SUBMAP(entry); local_start = entry->vme_start; - local_offset = VME_OFFSET(entry); + local_offset = (vm_map_offset_t)VME_OFFSET(entry); vm_map_reference(submap); vm_map_unlock_read(map); - ret = vm_map_create_upl(submap, - local_offset + (offset - local_start), - upl_size, upl, page_list, count, flags, tag); - vm_map_deallocate(submap); + DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap); + offset += offset_in_mapped_page; + *upl_size -= offset_in_mapped_page; - return ret; + if (release_map) { + vm_map_deallocate(map); + } + map = submap; + release_map = TRUE; + offset = local_offset + (offset - local_start); + goto start_with_map; } if (sync_cow_data && @@ -6080,7 +6699,7 @@ REDISCOVER_ENTRY: VME_OBJECT(entry)->copy)) { local_object = VME_OBJECT(entry); local_start = entry->vme_start; - local_offset = VME_OFFSET(entry); + local_offset = (vm_map_offset_t)VME_OFFSET(entry); vm_object_reference(local_object); vm_map_unlock_read(map); @@ -6103,7 +6722,7 @@ REDISCOVER_ENTRY: if (force_data_sync) { local_object = VME_OBJECT(entry); local_start = entry->vme_start; - local_offset = VME_OFFSET(entry); + local_offset = (vm_map_offset_t)VME_OFFSET(entry); vm_object_reference(local_object); vm_map_unlock_read(map); @@ -6133,10 +6752,9 @@ REDISCOVER_ENTRY: } local_object = VME_OBJECT(entry); - local_offset = VME_OFFSET(entry); + local_offset = (vm_map_offset_t)VME_OFFSET(entry); local_start = entry->vme_start; -#if CONFIG_EMBEDDED /* * Wiring will copy the pages to the shadow object. 
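(Aside on the submap handling rewritten just above: instead of recursing into vm_map_create_upl() for a submap, the patch takes a reference on the submap, drops the reference it may hold on the current map, translates the offset, and jumps back to the new start_with_map: label. The following is a minimal, self-contained sketch of that iterate-instead-of-recurse pattern; the toy_map type, toy_map_reference()/toy_map_deallocate() and the simplified offset translation are stand-ins for the kernel's vm_map machinery, not its actual implementation.)

#include <stdbool.h>
#include <stddef.h>

struct toy_map {
        int             ref_count;
        struct toy_map *submap;        /* non-NULL if this range is backed by a submap */
        unsigned long   submap_offset; /* simplified offset translation into the submap */
};

static void toy_map_reference(struct toy_map *m)  { m->ref_count++; }
static void toy_map_deallocate(struct toy_map *m) { m->ref_count--; }

int
toy_create_upl(struct toy_map *map, unsigned long offset)
{
        bool release_map = false;      /* do we hold our own reference on 'map'? */

start_with_map:
        if (map->submap != NULL) {
                struct toy_map *submap = map->submap;

                toy_map_reference(submap);          /* keep the submap alive          */
                if (release_map) {
                        toy_map_deallocate(map);    /* drop the ref taken on the way in */
                }
                offset += map->submap_offset;       /* translate into the submap       */
                map = submap;
                release_map = true;
                goto start_with_map;                /* iterate instead of recursing    */
        }

        /* ... build the UPL against 'map' at 'offset' here ... */
        (void)offset;

        if (release_map) {
                toy_map_deallocate(map);            /* mirrors the patch's done: path  */
        }
        return 0;
}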
* The shadow object will not be code-signed so @@ -6160,7 +6778,6 @@ REDISCOVER_ENTRY: uint64_t, (uint64_t)entry->vme_end); cs_executable_create_upl++; } -#endif /* CONFIG_EMBEDDED */ vm_object_lock(local_object); @@ -6203,6 +6820,10 @@ REDISCOVER_ENTRY: vm_map_unlock_read(map); + offset += offset_in_mapped_page; + assert(*upl_size > offset_in_mapped_page); + *upl_size -= offset_in_mapped_page; + ret = vm_object_iopl_request(local_object, ((vm_object_offset_t) ((offset - local_start) + local_offset)), @@ -6214,6 +6835,11 @@ REDISCOVER_ENTRY: tag); vm_object_deallocate(local_object); +done: + if (release_map) { + vm_map_deallocate(map); + } + return ret; } @@ -6245,6 +6871,9 @@ vm_map_enter_upl( return KERN_INVALID_ARGUMENT; } + DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx size 0x%x \n", map, upl, upl->flags, upl->map_object, upl->u_offset, upl->u_size); + assert(map == kernel_map); + if ((isVectorUPL = vector_upl_is_valid(upl))) { int mapped = 0, valid_upls = 0; vector_upl = upl; @@ -6270,7 +6899,13 @@ vm_map_enter_upl( } } - kr = kmem_suballoc(map, &vector_upl_dst_addr, vector_upl->size, FALSE, + if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) { + panic("TODO4K: vector UPL not implemented"); + } + + kr = kmem_suballoc(map, &vector_upl_dst_addr, + vector_upl->u_size, + FALSE, VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_NONE, &vector_upl_submap); if (kr != KERN_SUCCESS) { @@ -6306,6 +6941,9 @@ process_upl_to_enter: return KERN_FAILURE; } } + + size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)); + if ((!(upl->flags & UPL_SHADOWED)) && ((upl->flags & UPL_HAS_BUSY) || !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) { @@ -6318,12 +6956,12 @@ process_upl_to_enter: if (upl->flags & UPL_INTERNAL) { lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl)) - + ((upl->size / PAGE_SIZE) * sizeof(upl_page_info_t))); + + ((size / PAGE_SIZE) * sizeof(upl_page_info_t))); } else { lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl)); } object = upl->map_object; - upl->map_object = vm_object_allocate(upl->size); + upl->map_object = vm_object_allocate(vm_object_round_page(size)); vm_object_lock(upl->map_object); @@ -6331,11 +6969,18 @@ process_upl_to_enter: upl->map_object->pageout = TRUE; upl->map_object->can_persist = FALSE; upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; - upl->map_object->vo_shadow_offset = upl->offset - object->paging_offset; + upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset; + assertf(page_aligned(upl->map_object->vo_shadow_offset), + "object %p shadow_offset 0x%llx", + upl->map_object, + (uint64_t)upl->map_object->vo_shadow_offset); upl->map_object->wimg_bits = object->wimg_bits; + assertf(page_aligned(upl->map_object->vo_shadow_offset), + "object %p shadow_offset 0x%llx", + upl->map_object, upl->map_object->vo_shadow_offset); offset = upl->map_object->vo_shadow_offset; new_offset = 0; - size = upl->size; + size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)); upl->flags |= UPL_SHADOWED; @@ -6343,8 +6988,8 @@ process_upl_to_enter: pg_num = (unsigned int) (new_offset / PAGE_SIZE); assert(pg_num == new_offset / PAGE_SIZE); - if (lite_list[pg_num >> 5] & (1 << (pg_num & 31))) { - VM_PAGE_GRAB_FICTITIOUS(alias_page); + if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) { + alias_page = vm_page_grab_fictitious(TRUE); vm_object_lock(object); @@ -6390,10 +7035,10 @@ process_upl_to_enter: if (upl->flags & UPL_SHADOWED) { offset = 0; } else { - 
offset = upl->offset - upl->map_object->paging_offset; + offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset; } - size = upl->size; + size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)); vm_object_reference(upl->map_object); @@ -6458,6 +7103,17 @@ process_upl_to_enter: goto process_upl_to_enter; } + if (!isVectorUPL) { + vm_map_offset_t addr_adjustment; + + addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map))); + if (addr_adjustment) { + assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK); + DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment)); + *dst_addr += addr_adjustment; + } + } + upl_unlock(upl); return KERN_SUCCESS; @@ -6522,7 +7178,9 @@ process_upl_to_remove: vm_offset_t v_upl_submap_dst_addr; vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr); - vm_map_remove(map, v_upl_submap_dst_addr, v_upl_submap_dst_addr + vector_upl->size, VM_MAP_REMOVE_NO_FLAGS); + vm_map_remove(map, v_upl_submap_dst_addr, + v_upl_submap_dst_addr + vector_upl->u_size, + VM_MAP_REMOVE_NO_FLAGS); vm_map_deallocate(v_upl_submap); upl_unlock(vector_upl); return KERN_SUCCESS; @@ -6536,7 +7194,7 @@ process_upl_to_remove: if (upl->flags & UPL_PAGE_LIST_MAPPED) { addr = upl->kaddr; - size = upl->size; + size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)); assert(upl->ref_count > 1); upl->ref_count--; /* removing mapping ref */ @@ -6580,7 +7238,7 @@ upl_commit_range( mach_msg_type_number_t count, boolean_t *empty) { - upl_size_t xfer_size, subupl_size = size; + upl_size_t xfer_size, subupl_size; vm_object_t shadow_object; vm_object_t object; vm_object_t m_object; @@ -6591,8 +7249,9 @@ upl_commit_range( int occupied; int clear_refmod = 0; int pgpgout_count = 0; - struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT]; - struct vm_page_delayed_work *dwp; + struct vm_page_delayed_work dw_array; + struct vm_page_delayed_work *dwp, *dwp_start; + bool dwp_finish_ctx = TRUE; int dw_count; int dw_limit; int isVectorUPL = 0; @@ -6606,13 +7265,31 @@ upl_commit_range( int unwired_count = 0; int local_queue_count = 0; vm_page_t first_local, last_local; + vm_object_offset_t obj_start, obj_end, obj_offset; + kern_return_t kr = KERN_SUCCESS; + +// DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx flags 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, flags); + + dwp_start = dwp = NULL; + subupl_size = size; *empty = FALSE; if (upl == UPL_NULL) { return KERN_INVALID_ARGUMENT; } + dw_count = 0; + dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); + dwp_start = vm_page_delayed_work_get_ctx(); + if (dwp_start == NULL) { + dwp_start = &dw_array; + dw_limit = 1; + dwp_finish_ctx = FALSE; + } + + dwp = dwp_start; + if (count == 0) { page_list = NULL; } @@ -6631,12 +7308,14 @@ process_upl_to_commit: offset = subupl_offset; if (size == 0) { upl_unlock(vector_upl); - return KERN_SUCCESS; + kr = KERN_SUCCESS; + goto done; } upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size); if (upl == NULL) { upl_unlock(vector_upl); - return KERN_FAILURE; + kr = KERN_FAILURE; + goto done; } page_list = UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(upl); subupl_size -= size; @@ -6655,7 +7334,7 @@ process_upl_to_commit: #endif if (upl->flags & UPL_DEVICE_MEMORY) { xfer_size = 0; - } else if ((offset + size) <= upl->size) { + } else if ((offset + size) <= 
upl_adjusted_size(upl, PAGE_MASK)) { xfer_size = size; } else { if (!isVectorUPL) { @@ -6663,7 +7342,9 @@ process_upl_to_commit: } else { upl_unlock(vector_upl); } - return KERN_FAILURE; + DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size); + kr = KERN_FAILURE; + goto done; } if (upl->flags & UPL_SET_DIRTY) { flags |= UPL_COMMIT_SET_DIRTY; @@ -6674,7 +7355,7 @@ process_upl_to_commit: if (upl->flags & UPL_INTERNAL) { lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl)) - + ((upl->size / PAGE_SIZE) * sizeof(upl_page_info_t))); + + ((upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE) * sizeof(upl_page_info_t))); } else { lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl)); } @@ -6690,9 +7371,6 @@ process_upl_to_commit: entry = offset / PAGE_SIZE; target_offset = (vm_object_offset_t)offset; - assert(!(target_offset & PAGE_MASK)); - assert(!(xfer_size & PAGE_MASK)); - if (upl->flags & UPL_KERNEL_OBJECT) { vm_object_lock_shared(shadow_object); } else { @@ -6727,10 +7405,6 @@ process_upl_to_commit: should_be_throttled = TRUE; } - dwp = &dw_array[0]; - dw_count = 0; - dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); - if ((upl->flags & UPL_IO_WIRE) && !(flags & UPL_COMMIT_FREE_ABSENT) && !isVectorUPL && @@ -6754,7 +7428,13 @@ process_upl_to_commit: first_local = VM_PAGE_NULL; last_local = VM_PAGE_NULL; - while (xfer_size) { + obj_start = target_offset + upl->u_offset - shadow_object->paging_offset; + obj_end = obj_start + xfer_size; + obj_start = vm_object_trunc_page(obj_start); + obj_end = vm_object_round_page(obj_end); + for (obj_offset = obj_start; + obj_offset < obj_end; + obj_offset += PAGE_SIZE) { vm_page_t t, m; dwp->dw_mask = 0; @@ -6773,11 +7453,11 @@ process_upl_to_commit: pg_num = (unsigned int) (target_offset / PAGE_SIZE); assert(pg_num == target_offset / PAGE_SIZE); - if (lite_list[pg_num >> 5] & (1 << (pg_num & 31))) { - lite_list[pg_num >> 5] &= ~(1 << (pg_num & 31)); + if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) { + lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31)); if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) { - m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset)); + m = vm_page_lookup(shadow_object, obj_offset); } } else { m = NULL; @@ -6813,9 +7493,9 @@ process_upl_to_commit: * Set the code signing bits according to * what the UPL says they should be. */ - m->vmp_cs_validated = page_list[entry].cs_validated; - m->vmp_cs_tainted = page_list[entry].cs_tainted; - m->vmp_cs_nx = page_list[entry].cs_nx; + m->vmp_cs_validated |= page_list[entry].cs_validated; + m->vmp_cs_tainted |= page_list[entry].cs_tainted; + m->vmp_cs_nx |= page_list[entry].cs_nx; } if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL) { m->vmp_written_by_kernel = TRUE; @@ -6832,7 +7512,8 @@ process_upl_to_commit: m->vmp_dirty = FALSE; if (!(flags & UPL_COMMIT_CS_VALIDATED) && - m->vmp_cs_validated && !m->vmp_cs_tainted) { + m->vmp_cs_validated && + m->vmp_cs_tainted != VMP_CS_ALL_TRUE) { /* * CODE SIGNING: * This page is no longer dirty @@ -6840,7 +7521,7 @@ process_upl_to_commit: * so it will need to be * re-validated. 
*/ - m->vmp_cs_validated = FALSE; + m->vmp_cs_validated = VMP_CS_ALL_FALSE; VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1); @@ -6965,7 +7646,8 @@ process_upl_to_commit: } if (!(flags & UPL_COMMIT_CS_VALIDATED) && - m->vmp_cs_validated && !m->vmp_cs_tainted) { + m->vmp_cs_validated && + m->vmp_cs_tainted != VMP_CS_ALL_TRUE) { /* * CODE SIGNING: * This page is no longer dirty @@ -6973,7 +7655,7 @@ process_upl_to_commit: * so it will need to be * re-validated. */ - m->vmp_cs_validated = FALSE; + m->vmp_cs_validated = VMP_CS_ALL_FALSE; VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1); @@ -7009,10 +7691,17 @@ process_upl_to_commit: if (m->vmp_free_when_done) { /* * With the clean queue enabled, UPL_PAGEOUT should - * no longer set the pageout bit. It's pages now go + * no longer set the pageout bit. Its pages now go * to the clean queue. + * + * We don't use the cleaned Q anymore and so this + * assert isn't correct. The code for the clean Q + * still exists and might be used in the future. If we + * go back to the cleaned Q, we will re-enable this + * assert. + * + * assert(!(upl->flags & UPL_PAGEOUT)); */ - assert(!(flags & UPL_PAGEOUT)); assert(!m_object->internal); m->vmp_free_when_done = FALSE; @@ -7030,7 +7719,7 @@ process_upl_to_commit: dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP; if (upl->flags & UPL_PAGEOUT) { - VM_STAT_INCR(reactivations); + counter_inc(&vm_statistics_reactivations); DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL); } } else { @@ -7071,7 +7760,7 @@ process_upl_to_commit: if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & UPL_PAGEOUT)) { pgpgout_count++; - VM_STAT_INCR(pageouts); + counter_inc(&vm_statistics_pageouts); DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL); dwp->dw_mask |= DW_enqueue_cleaned; @@ -7126,9 +7815,9 @@ commit_next_page: VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count); if (dw_count >= dw_limit) { - vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count); + vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count); - dwp = &dw_array[0]; + dwp = dwp_start; dw_count = 0; } } else { @@ -7143,7 +7832,9 @@ commit_next_page: } } if (dw_count) { - vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count); + vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count); + dwp = dwp_start; + dw_count = 0; } if (fast_path_possible) { @@ -7231,7 +7922,7 @@ commit_next_page: occupied = 0; if (!fast_path_full_commit) { - pg_num = upl->size / PAGE_SIZE; + pg_num = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE; pg_num = (pg_num + 31) >> 5; for (i = 0; i < pg_num; i++) { @@ -7304,7 +7995,14 @@ commit_next_page: DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL); } - return KERN_SUCCESS; + kr = KERN_SUCCESS; +done: + if (dwp_start && dwp_finish_ctx) { + vm_page_delayed_work_finish_ctx(dwp_start); + dwp_start = dwp = NULL; + } + + return kr; } kern_return_t @@ -7316,7 +8014,7 @@ upl_abort_range( boolean_t *empty) { upl_page_info_t *user_page_list = NULL; - upl_size_t xfer_size, subupl_size = size; + upl_size_t xfer_size, subupl_size; vm_object_t shadow_object; vm_object_t object; vm_object_offset_t target_offset; @@ -7324,13 +8022,21 @@ upl_abort_range( int entry; wpl_array_t lite_list; int occupied; - struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT]; - struct vm_page_delayed_work *dwp; + struct vm_page_delayed_work dw_array; + struct vm_page_delayed_work *dwp, *dwp_start; + bool dwp_finish_ctx = TRUE; int dw_count; int dw_limit; 
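(Aside on the dwp_start/dw_array locals being introduced here and in upl_commit_range()/vm_object_iopl_request(): the fixed on-stack dw_array[DEFAULT_DELAYED_WORK_LIMIT] batch is replaced by a context obtained from vm_page_delayed_work_get_ctx(), with a single-entry stack fallback when that allocation fails and a matching vm_page_delayed_work_finish_ctx() on every exit path. The sketch below restates only the calling pattern; the struct, limit, and helper functions are stand-ins, not the kernel's.)

#include <stdlib.h>

struct dw_entry { void *page; int mask; };   /* stand-in for struct vm_page_delayed_work */
#define DW_LIMIT 32                          /* stand-in for DEFAULT_DELAYED_WORK_LIMIT  */

static struct dw_entry *get_ctx(void)          { return calloc(DW_LIMIT, sizeof(struct dw_entry)); }
static void finish_ctx(struct dw_entry *p)     { free(p); }
static void do_delayed_work(struct dw_entry *batch, int n) { (void)batch; (void)n; /* apply queued page ops */ }

void
process_pages(void **pages, int npages)
{
        struct dw_entry  dw_array;           /* emergency, single-entry batch       */
        struct dw_entry *dwp_start, *dwp;
        int              dw_count = 0;
        int              dw_limit = DW_LIMIT;
        int              dwp_finish_ctx = 1;

        dwp_start = get_ctx();
        if (dwp_start == NULL) {             /* allocation failed: degrade to a batch of 1 */
                dwp_start = &dw_array;
                dw_limit = 1;
                dwp_finish_ctx = 0;
        }
        dwp = dwp_start;

        for (int i = 0; i < npages; i++) {
                dwp->page = pages[i];
                dwp->mask = 1;
                dwp++;
                if (++dw_count >= dw_limit) { /* batch full: flush and rewind */
                        do_delayed_work(dwp_start, dw_count);
                        dwp = dwp_start;
                        dw_count = 0;
                }
        }
        if (dw_count) {                      /* flush the tail */
                do_delayed_work(dwp_start, dw_count);
        }
        if (dwp_finish_ctx) {                /* release only what was really allocated */
                finish_ctx(dwp_start);
        }
}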
int isVectorUPL = 0; upl_t vector_upl = NULL; + vm_object_offset_t obj_start, obj_end, obj_offset; + kern_return_t kr = KERN_SUCCESS; + +// DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx error 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, error); + dwp_start = dwp = NULL; + + subupl_size = size; *empty = FALSE; if (upl == UPL_NULL) { @@ -7341,6 +8047,17 @@ upl_abort_range( return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty); } + dw_count = 0; + dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); + dwp_start = vm_page_delayed_work_get_ctx(); + if (dwp_start == NULL) { + dwp_start = &dw_array; + dw_limit = 1; + dwp_finish_ctx = FALSE; + } + + dwp = dwp_start; + if ((isVectorUPL = vector_upl_is_valid(upl))) { vector_upl = upl; upl_lock(vector_upl); @@ -7354,12 +8071,14 @@ process_upl_to_abort: offset = subupl_offset; if (size == 0) { upl_unlock(vector_upl); - return KERN_SUCCESS; + kr = KERN_SUCCESS; + goto done; } upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size); if (upl == NULL) { upl_unlock(vector_upl); - return KERN_FAILURE; + kr = KERN_FAILURE; + goto done; } subupl_size -= size; subupl_offset += size; @@ -7380,7 +8099,7 @@ process_upl_to_abort: #endif if (upl->flags & UPL_DEVICE_MEMORY) { xfer_size = 0; - } else if ((offset + size) <= upl->size) { + } else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) { xfer_size = size; } else { if (!isVectorUPL) { @@ -7388,13 +8107,14 @@ process_upl_to_abort: } else { upl_unlock(vector_upl); } - - return KERN_FAILURE; + DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size); + kr = KERN_FAILURE; + goto done; } if (upl->flags & UPL_INTERNAL) { lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl)) - + ((upl->size / PAGE_SIZE) * sizeof(upl_page_info_t))); + + ((upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE) * sizeof(upl_page_info_t))); user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl)); } else { @@ -7413,9 +8133,6 @@ process_upl_to_abort: entry = offset / PAGE_SIZE; target_offset = (vm_object_offset_t)offset; - assert(!(target_offset & PAGE_MASK)); - assert(!(xfer_size & PAGE_MASK)); - if (upl->flags & UPL_KERNEL_OBJECT) { vm_object_lock_shared(shadow_object); } else { @@ -7428,15 +8145,17 @@ process_upl_to_abort: vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED); } - dwp = &dw_array[0]; - dw_count = 0; - dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); - if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT)) { panic("upl_abort_range: kernel_object being DUMPED"); } - while (xfer_size) { + obj_start = target_offset + upl->u_offset - shadow_object->paging_offset; + obj_end = obj_start + xfer_size; + obj_start = vm_object_trunc_page(obj_start); + obj_end = vm_object_round_page(obj_end); + for (obj_offset = obj_start; + obj_offset < obj_end; + obj_offset += PAGE_SIZE) { vm_page_t t, m; unsigned int pg_num; boolean_t needed; @@ -7454,12 +8173,11 @@ process_upl_to_abort: m = VM_PAGE_NULL; if (upl->flags & UPL_LITE) { - if (lite_list[pg_num >> 5] & (1 << (pg_num & 31))) { - lite_list[pg_num >> 5] &= ~(1 << (pg_num & 31)); + if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) { + lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31)); if (!(upl->flags & UPL_KERNEL_OBJECT)) { - m = vm_page_lookup(shadow_object, target_offset + - (upl->offset - 
shadow_object->paging_offset)); + m = vm_page_lookup(shadow_object, obj_offset); } } } @@ -7610,9 +8328,9 @@ abort_next_page: VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count); if (dw_count >= dw_limit) { - vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count); + vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count); - dwp = &dw_array[0]; + dwp = dwp_start; dw_count = 0; } } else { @@ -7627,7 +8345,9 @@ abort_next_page: } } if (dw_count) { - vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count); + vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count); + dwp = dwp_start; + dw_count = 0; } occupied = 1; @@ -7638,7 +8358,7 @@ abort_next_page: int pg_num; int i; - pg_num = upl->size / PAGE_SIZE; + pg_num = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE; pg_num = (pg_num + 31) >> 5; occupied = 0; @@ -7707,7 +8427,15 @@ abort_next_page: goto process_upl_to_abort; } - return KERN_SUCCESS; + kr = KERN_SUCCESS; + +done: + if (dwp_start && dwp_finish_ctx) { + vm_page_delayed_work_finish_ctx(dwp_start); + dwp_start = dwp = NULL; + } + + return kr; } @@ -7722,7 +8450,7 @@ upl_abort( return KERN_INVALID_ARGUMENT; } - return upl_abort_range(upl, 0, upl->size, error, &empty); + return upl_abort_range(upl, 0, upl->u_size, error, &empty); } @@ -7739,7 +8467,8 @@ upl_commit( return KERN_INVALID_ARGUMENT; } - return upl_commit_range(upl, 0, upl->size, 0, page_list, count, &empty); + return upl_commit_range(upl, 0, upl->u_size, 0, + page_list, count, &empty); } @@ -7776,7 +8505,7 @@ iopl_valid_data( object, object->purgable); } - size = upl->size; + size = upl_adjusted_size(upl, PAGE_MASK); vm_object_lock(object); VM_OBJECT_WIRED_PAGE_UPDATE_START(object); @@ -7784,7 +8513,7 @@ iopl_valid_data( if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) { nxt_page = (vm_page_t)vm_page_queue_first(&object->memq); } else { - offset = 0 + upl->offset - object->paging_offset; + offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset); } while (size) { @@ -7914,7 +8643,7 @@ vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t us } entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE); assert(entry >= 0 && entry < object->resident_page_count); - lite_list[entry >> 5] |= 1 << (entry & 31); + lite_list[entry >> 5] |= 1U << (entry & 31); phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page); @@ -8039,7 +8768,7 @@ vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t u vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update); - lite_list[entry >> 5] |= 1 << (entry & 31); + lite_list[entry >> 5] |= 1U << (entry & 31); phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page); @@ -8139,8 +8868,9 @@ vm_object_iopl_request( kern_return_t ret; vm_prot_t prot; struct vm_object_fault_info fault_info = {}; - struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT]; - struct vm_page_delayed_work *dwp; + struct vm_page_delayed_work dw_array; + struct vm_page_delayed_work *dwp, *dwp_start; + bool dwp_finish_ctx = TRUE; int dw_count; int dw_limit; int dw_index; @@ -8158,6 +8888,19 @@ vm_object_iopl_request( task_t task = current_task(); #endif /* DEVELOPMENT || DEBUG */ + dwp_start = dwp = NULL; + + vm_object_offset_t original_offset = offset; + upl_size_t original_size = size; + +// DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, 
(uint64_t)size, cntrl_flags); + + size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset)); + offset = vm_object_trunc_page(offset); + if (size != original_size || offset != original_offset) { + DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size); + } + if (cntrl_flags & ~UPL_VALID_FLAGS) { /* * For forward compatibility's sake, @@ -8198,6 +8941,7 @@ vm_object_iopl_request( panic("vm_object_iopl_request: external object with non-zero paging offset\n"); } + VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0); #if CONFIG_IOSCHED || UPL_DEBUG @@ -8219,6 +8963,17 @@ vm_object_iopl_request( psize = PAGE_SIZE; } else { psize = size; + + dw_count = 0; + dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); + dwp_start = vm_page_delayed_work_get_ctx(); + if (dwp_start == NULL) { + dwp_start = &dw_array; + dw_limit = 1; + dwp_finish_ctx = FALSE; + } + + dwp = dwp_start; } if (cntrl_flags & UPL_SET_INTERNAL) { @@ -8253,7 +9008,8 @@ vm_object_iopl_request( } upl->map_object = object; - upl->size = size; + upl->u_offset = original_offset; + upl->u_size = original_size; size_in_pages = size / PAGE_SIZE; @@ -8272,7 +9028,7 @@ vm_object_iopl_request( /* * paging in progress also protects the paging_offset */ - upl->offset = offset + object->paging_offset; + upl->u_offset = original_offset + object->paging_offset; if (cntrl_flags & UPL_BLOCK_ACCESS) { /* @@ -8283,7 +9039,7 @@ vm_object_iopl_request( } #if CONFIG_IOSCHED || UPL_DEBUG - if (upl->flags & UPL_TRACKED_BY_OBJECT) { + if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) { vm_object_activity_begin(object); queue_enter(&object->uplq, upl, upl_t, uplq); } @@ -8413,7 +9169,6 @@ vm_object_iopl_request( xfer_size = size; dst_offset = offset; - dw_count = 0; if (fast_path_full_req) { if (vm_object_iopl_wire_full(object, upl, user_page_list, lite_list, cntrl_flags, tag) == TRUE) { @@ -8444,9 +9199,6 @@ vm_object_iopl_request( fault_info.interruptible = interruptible; fault_info.batch_pmap_op = TRUE; - dwp = &dw_array[0]; - dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); - while (xfer_size) { vm_fault_return_t result; @@ -8579,10 +9331,11 @@ vm_object_iopl_request( VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1); - /* fall thru */ + OS_FALLTHROUGH; case VM_FAULT_INTERRUPTED: error_code = MACH_SEND_INTERRUPTED; + OS_FALLTHROUGH; case VM_FAULT_MEMORY_ERROR: memory_error: ret = (error_code ? error_code: KERN_MEMORY_ERROR); @@ -8719,6 +9472,22 @@ memory_error: if (!(cntrl_flags & UPL_COPYOUT_FROM)) { SET_PAGE_DIRTY(dst_page, TRUE); + /* + * Page belonging to a code-signed object is about to + * be written. Mark it tainted and disconnect it from + * all pmaps so processes have to fault it back in and + * deal with the tainted bit. 
+            */
+           if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
+               dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
+               vm_page_iopl_tainted++;
+               if (dst_page->vmp_pmapped) {
+                   int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
+                   if (refmod & VM_MEM_REFERENCED) {
+                       dst_page->vmp_reference = TRUE;
+                   }
+               }
+           }
        }
        if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
            pmap_sync_page_attributes_phys(phys_page);
@@ -8730,7 +9499,7 @@ record_phys_addr:
            upl->flags |= UPL_HAS_BUSY;
        }
-       lite_list[entry >> 5] |= 1 << (entry & 31);
+       lite_list[entry >> 5] |= 1U << (entry & 31);
        if (phys_page > upl->highest_page) {
            upl->highest_page = phys_page;
@@ -8773,9 +9542,9 @@ skip_page:
            VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
            if (dw_count >= dw_limit) {
-               vm_page_do_delayed_work(object, tag, &dw_array[0], dw_count);
+               vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
-               dwp = &dw_array[0];
+               dwp = dwp_start;
                dw_count = 0;
            }
        }
@@ -8783,7 +9552,9 @@ skip_page:
    assert(entry == size_in_pages);
    if (dw_count) {
-       vm_page_do_delayed_work(object, tag, &dw_array[0], dw_count);
+       vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
+       dwp = dwp_start;
+       dw_count = 0;
    }
 finish:
    if (user_page_list && set_cache_attr_needed == TRUE) {
@@ -8807,7 +9578,9 @@ finish:
         * can't be accessed without causing a page fault.
         */
        vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
-           PMAP_NULL, 0, VM_PROT_NONE);
+           PMAP_NULL,
+           PAGE_SIZE,
+           0, VM_PROT_NONE);
        assert(!object->blocked_access);
        object->blocked_access = TRUE;
    }
@@ -8818,6 +9591,12 @@ finish:
        ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
    }
 #endif /* DEVELOPMENT || DEBUG */
+
+   if (dwp_start && dwp_finish_ctx) {
+       vm_page_delayed_work_finish_ctx(dwp_start);
+       dwp_start = dwp = NULL;
+   }
+
    return KERN_SUCCESS;
 return_err:
@@ -8845,7 +9624,7 @@ return_err:
            need_unwire = TRUE;
            if (dw_count) {
-               if (dw_array[dw_index].dw_m == dst_page) {
+               if ((dwp_start)[dw_index].dw_m == dst_page) {
                    /*
                     * still in the deferred work list
                     * which means we haven't yet called
@@ -8873,7 +9652,7 @@ return_err:
            vm_page_unlock_queues();
            if (need_unwire == TRUE) {
-               VM_STAT_INCR(reactivations);
+               counter_inc(&vm_statistics_reactivations);
            }
        }
 #if UPL_DEBUG
@@ -8892,6 +9671,11 @@ return_err:
        ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
    }
 #endif /* DEVELOPMENT || DEBUG */
+
+   if (dwp_start && dwp_finish_ctx) {
+       vm_page_delayed_work_finish_ctx(dwp_start);
+       dwp_start = dwp = NULL;
+   }
    return ret;
 }
@@ -8926,8 +9710,8 @@ upl_transpose(
    object1 = upl1->map_object;
    object2 = upl2->map_object;
-   if (upl1->offset != 0 || upl2->offset != 0 ||
-       upl1->size != upl2->size) {
+   if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
+       upl1->u_size != upl2->u_size) {
        /*
         * We deal only with full objects, not subsets.
         * That's because we exchange the entire backing store info
@@ -8942,7 +9726,7 @@ upl_transpose(
     * Transpose the VM objects' backing store.
     */
    retval = vm_object_transpose(object1, object2,
-       (vm_object_size_t) upl1->size);
+       upl_adjusted_size(upl1, PAGE_MASK));
    if (retval == KERN_SUCCESS) {
        /*
@@ -8954,10 +9738,10 @@ upl_transpose(
            vm_object_lock(object1);
            vm_object_lock(object2);
        }
-       if (upl1->flags & UPL_TRACKED_BY_OBJECT) {
+       if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
            queue_remove(&object1->uplq, upl1, upl_t, uplq);
        }
-       if (upl2->flags & UPL_TRACKED_BY_OBJECT) {
+       if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
            queue_remove(&object2->uplq, upl2, upl_t, uplq);
        }
 #endif
@@ -8965,10 +9749,10 @@
        upl1->map_object = object2;
        upl2->map_object = object1;
 #if CONFIG_IOSCHED || UPL_DEBUG
-       if (upl1->flags & UPL_TRACKED_BY_OBJECT) {
+       if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
            queue_enter(&object2->uplq, upl1, upl_t, uplq);
        }
-       if (upl2->flags & UPL_TRACKED_BY_OBJECT) {
+       if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
            queue_enter(&object1->uplq, upl2, upl_t, uplq);
        }
        if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
@@ -9004,7 +9788,7 @@ upl_range_needed(
        return;
    }
-   size_in_pages = upl->size / PAGE_SIZE;
+   size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
    user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
@@ -9023,7 +9807,7 @@
  * virtual address space each time we need to work with
  * a physical page.
  */
-decl_simple_lock_data(, vm_paging_lock)
+SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
 #define VM_PAGING_NUM_PAGES 64
 vm_map_offset_t vm_paging_base_address = 0;
 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
@@ -9037,6 +9821,7 @@
 unsigned long vm_paging_pages_mapped = 0;
 unsigned long vm_paging_objects_mapped_slow = 0;
 unsigned long vm_paging_pages_mapped_slow = 0;
+__startup_func
 void
 vm_paging_map_init(void)
 {
@@ -9107,20 +9892,10 @@ vm_paging_map_object(
    if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
        /* use permanent 1-to-1 kernel mapping of physical memory ? */
-#if __x86_64__
-       *address = (vm_map_offset_t)
-           PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) <<
-           PAGE_SHIFT);
-       *need_unmap = FALSE;
-       return KERN_SUCCESS;
-#elif __arm__ || __arm64__
        *address = (vm_map_offset_t)
            phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) <<
            PAGE_SHIFT);
        *need_unmap = FALSE;
        return KERN_SUCCESS;
-#else
-#warn "vm_paging_map_object: no 1-to-1 kernel mapping of physical memory..."
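(Aside on the upl->u_offset/u_size fields used above: they keep the caller's possibly unaligned offset and size, and the page-aligned views are recomputed on demand by upl_adjusted_offset()/upl_adjusted_size(), with upl_get_data_offset() returning the sub-page remainder; their definitions appear further down in this diff. The same trunc/round/delta arithmetic is what vm_map_create_upl() does with offset_in_mapped_page for maps whose page size is smaller than the kernel's. A standalone restatement with a worked example; trunc_mask()/round_mask() are stand-ins for trunc_page_mask_64()/round_page_mask_64().)

#include <stdint.h>
#include <stdio.h>

static uint64_t trunc_mask(uint64_t x, uint64_t pgmask) { return x & ~pgmask; }
static uint64_t round_mask(uint64_t x, uint64_t pgmask) { return (x + pgmask) & ~pgmask; }

int
main(void)
{
        uint64_t u_offset = 0x1800;   /* unaligned offset, as kept in upl->u_offset */
        uint64_t u_size   = 0x2400;   /* unaligned size, as kept in upl->u_size     */
        uint64_t pgmask   = 0xFFF;    /* 4K page mask (PAGE_MASK on a 4K kernel)    */

        uint64_t adj_offset = trunc_mask(u_offset, pgmask);                       /* 0x1000 */
        uint64_t adj_size   = round_mask(u_offset + u_size, pgmask) - adj_offset; /* 0x3000 */
        uint64_t data_off   = u_offset - adj_offset;                              /* 0x0800 */

        printf("adjusted offset 0x%llx size 0x%llx data offset 0x%llx\n",
            (unsigned long long)adj_offset,
            (unsigned long long)adj_size,
            (unsigned long long)data_off);
        return 0;
}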
-#endif assert(page->vmp_busy); /* @@ -9411,7 +10186,7 @@ vector_upl_create(vm_offset_t upl_offset) upl = upl_create(0, UPL_VECTOR, 0); upl->vector_upl = vector_upl; - upl->offset = upl_offset; + upl->u_offset = upl_offset; vector_upl->size = 0; vector_upl->offset = upl_offset; vector_upl->invalid_upls = 0; @@ -9479,7 +10254,7 @@ vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size) subupl->vector_upl = (void*)vector_upl; vector_upl->upl_elems[vector_upl->num_upls++] = subupl; vector_upl->size += io_size; - upl->size += io_size; + upl->u_size += io_size; } else { uint32_t i = 0, invalid_upls = 0; for (i = 0; i < vector_upl->num_upls; i++) { @@ -9492,7 +10267,8 @@ vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size) } vector_upl->upl_elems[i] = NULL; - invalid_upls = hw_atomic_add(&(vector_upl)->invalid_upls, 1); + invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls, + relaxed); if (invalid_upls == vector_upl->num_upls) { return TRUE; } else { @@ -9525,7 +10301,7 @@ vector_upl_set_pagelist(upl_t upl) vector_upl->pagelist = (upl_page_info_array_t)kalloc(sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)); for (i = 0; i < vector_upl->num_upls; i++) { - cur_upl_pagelist_size = sizeof(struct upl_page_info) * vector_upl->upl_elems[i]->size / PAGE_SIZE; + cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upl_elems[i], PAGE_MASK) / PAGE_SIZE; bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size); pagelist_size += cur_upl_pagelist_size; if (vector_upl->upl_elems[i]->highest_page > upl->highest_page) { @@ -10011,7 +10787,35 @@ upl_size_t upl_get_size( upl_t upl) { - return upl->size; + return upl_adjusted_size(upl, PAGE_MASK); +} + +upl_size_t +upl_adjusted_size( + upl_t upl, + vm_map_offset_t pgmask) +{ + vm_object_offset_t start_offset, end_offset; + + start_offset = trunc_page_mask_64(upl->u_offset, pgmask); + end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask); + + return (upl_size_t)(end_offset - start_offset); +} + +vm_object_offset_t +upl_adjusted_offset( + upl_t upl, + vm_map_offset_t pgmask) +{ + return trunc_page_mask_64(upl->u_offset, pgmask); +} + +vm_object_offset_t +upl_get_data_offset( + upl_t upl) +{ + return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK); } upl_t @@ -10130,440 +10934,3 @@ VM_PRESSURE_CRITICAL_TO_WARNING(void) } } #endif /* VM_PRESSURE_EVENTS */ - - - -#define VM_TEST_COLLAPSE_COMPRESSOR 0 -#define VM_TEST_WIRE_AND_EXTRACT 0 -#define VM_TEST_PAGE_WIRE_OVERFLOW_PANIC 0 -#if __arm64__ -#define VM_TEST_KERNEL_OBJECT_FAULT 0 -#endif /* __arm64__ */ -#define VM_TEST_DEVICE_PAGER_TRANSPOSE (DEVELOPMENT || DEBUG) - -#if VM_TEST_COLLAPSE_COMPRESSOR -extern boolean_t vm_object_collapse_compressor_allowed; -#include -static void -vm_test_collapse_compressor(void) -{ - vm_object_size_t backing_size, top_size; - vm_object_t backing_object, top_object; - vm_map_offset_t backing_offset, top_offset; - unsigned char *backing_address, *top_address; - kern_return_t kr; - - printf("VM_TEST_COLLAPSE_COMPRESSOR:\n"); - - /* create backing object */ - backing_size = 15 * PAGE_SIZE; - backing_object = vm_object_allocate(backing_size); - assert(backing_object != VM_OBJECT_NULL); - printf("VM_TEST_COLLAPSE_COMPRESSOR: created backing object %p\n", - backing_object); - /* map backing object */ - backing_offset = 0; - kr = vm_map_enter(kernel_map, &backing_offset, backing_size, 0, - VM_FLAGS_ANYWHERE, 
VM_MAP_KERNEL_FLAGS_NONE, - backing_object, 0, FALSE, - VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_DEFAULT); - assert(kr == KERN_SUCCESS); - backing_address = (unsigned char *) backing_offset; - printf("VM_TEST_COLLAPSE_COMPRESSOR: " - "mapped backing object %p at 0x%llx\n", - backing_object, (uint64_t) backing_offset); - /* populate with pages to be compressed in backing object */ - backing_address[0x1 * PAGE_SIZE] = 0xB1; - backing_address[0x4 * PAGE_SIZE] = 0xB4; - backing_address[0x7 * PAGE_SIZE] = 0xB7; - backing_address[0xa * PAGE_SIZE] = 0xBA; - backing_address[0xd * PAGE_SIZE] = 0xBD; - printf("VM_TEST_COLLAPSE_COMPRESSOR: " - "populated pages to be compressed in " - "backing_object %p\n", backing_object); - /* compress backing object */ - vm_object_pageout(backing_object); - printf("VM_TEST_COLLAPSE_COMPRESSOR: compressing backing_object %p\n", - backing_object); - /* wait for all the pages to be gone */ - while (*(volatile int *)&backing_object->resident_page_count != 0) { - IODelay(10); - } - printf("VM_TEST_COLLAPSE_COMPRESSOR: backing_object %p compressed\n", - backing_object); - /* populate with pages to be resident in backing object */ - backing_address[0x0 * PAGE_SIZE] = 0xB0; - backing_address[0x3 * PAGE_SIZE] = 0xB3; - backing_address[0x6 * PAGE_SIZE] = 0xB6; - backing_address[0x9 * PAGE_SIZE] = 0xB9; - backing_address[0xc * PAGE_SIZE] = 0xBC; - printf("VM_TEST_COLLAPSE_COMPRESSOR: " - "populated pages to be resident in " - "backing_object %p\n", backing_object); - /* leave the other pages absent */ - /* mess with the paging_offset of the backing_object */ - assert(backing_object->paging_offset == 0); - backing_object->paging_offset = 0x3000; - - /* create top object */ - top_size = 9 * PAGE_SIZE; - top_object = vm_object_allocate(top_size); - assert(top_object != VM_OBJECT_NULL); - printf("VM_TEST_COLLAPSE_COMPRESSOR: created top object %p\n", - top_object); - /* map top object */ - top_offset = 0; - kr = vm_map_enter(kernel_map, &top_offset, top_size, 0, - VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_NONE, - top_object, 0, FALSE, - VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_DEFAULT); - assert(kr == KERN_SUCCESS); - top_address = (unsigned char *) top_offset; - printf("VM_TEST_COLLAPSE_COMPRESSOR: " - "mapped top object %p at 0x%llx\n", - top_object, (uint64_t) top_offset); - /* populate with pages to be compressed in top object */ - top_address[0x3 * PAGE_SIZE] = 0xA3; - top_address[0x4 * PAGE_SIZE] = 0xA4; - top_address[0x5 * PAGE_SIZE] = 0xA5; - printf("VM_TEST_COLLAPSE_COMPRESSOR: " - "populated pages to be compressed in " - "top_object %p\n", top_object); - /* compress top object */ - vm_object_pageout(top_object); - printf("VM_TEST_COLLAPSE_COMPRESSOR: compressing top_object %p\n", - top_object); - /* wait for all the pages to be gone */ - while (top_object->resident_page_count != 0) { - IODelay(10); - } - printf("VM_TEST_COLLAPSE_COMPRESSOR: top_object %p compressed\n", - top_object); - /* populate with pages to be resident in top object */ - top_address[0x0 * PAGE_SIZE] = 0xA0; - top_address[0x1 * PAGE_SIZE] = 0xA1; - top_address[0x2 * PAGE_SIZE] = 0xA2; - printf("VM_TEST_COLLAPSE_COMPRESSOR: " - "populated pages to be resident in " - "top_object %p\n", top_object); - /* leave the other pages absent */ - - /* link the 2 objects */ - vm_object_reference(backing_object); - top_object->shadow = backing_object; - top_object->vo_shadow_offset = 0x3000; - printf("VM_TEST_COLLAPSE_COMPRESSOR: linked %p and %p\n", - top_object, backing_object); - - /* unmap backing object 
*/ - vm_map_remove(kernel_map, - backing_offset, - backing_offset + backing_size, - VM_MAP_REMOVE_NO_FLAGS); - printf("VM_TEST_COLLAPSE_COMPRESSOR: " - "unmapped backing_object %p [0x%llx:0x%llx]\n", - backing_object, - (uint64_t) backing_offset, - (uint64_t) (backing_offset + backing_size)); - - /* collapse */ - printf("VM_TEST_COLLAPSE_COMPRESSOR: collapsing %p\n", top_object); - vm_object_lock(top_object); - vm_object_collapse(top_object, 0, FALSE); - vm_object_unlock(top_object); - printf("VM_TEST_COLLAPSE_COMPRESSOR: collapsed %p\n", top_object); - - /* did it work? */ - if (top_object->shadow != VM_OBJECT_NULL) { - printf("VM_TEST_COLLAPSE_COMPRESSOR: not collapsed\n"); - printf("VM_TEST_COLLAPSE_COMPRESSOR: FAIL\n"); - if (vm_object_collapse_compressor_allowed) { - panic("VM_TEST_COLLAPSE_COMPRESSOR: FAIL\n"); - } - } else { - /* check the contents of the mapping */ - unsigned char expect[9] = - { 0xA0, 0xA1, 0xA2, /* resident in top */ - 0xA3, 0xA4, 0xA5, /* compressed in top */ - 0xB9, /* resident in backing + shadow_offset */ - 0xBD, /* compressed in backing + shadow_offset + paging_offset */ - 0x00 }; /* absent in both */ - unsigned char actual[9]; - unsigned int i, errors; - - errors = 0; - for (i = 0; i < sizeof(actual); i++) { - actual[i] = (unsigned char) top_address[i * PAGE_SIZE]; - if (actual[i] != expect[i]) { - errors++; - } - } - printf("VM_TEST_COLLAPSE_COMPRESSOR: " - "actual [%x %x %x %x %x %x %x %x %x] " - "expect [%x %x %x %x %x %x %x %x %x] " - "%d errors\n", - actual[0], actual[1], actual[2], actual[3], - actual[4], actual[5], actual[6], actual[7], - actual[8], - expect[0], expect[1], expect[2], expect[3], - expect[4], expect[5], expect[6], expect[7], - expect[8], - errors); - if (errors) { - panic("VM_TEST_COLLAPSE_COMPRESSOR: FAIL\n"); - } else { - printf("VM_TEST_COLLAPSE_COMPRESSOR: PASS\n"); - } - } -} -#else /* VM_TEST_COLLAPSE_COMPRESSOR */ -#define vm_test_collapse_compressor() -#endif /* VM_TEST_COLLAPSE_COMPRESSOR */ - -#if VM_TEST_WIRE_AND_EXTRACT -extern ledger_template_t task_ledger_template; -#include -extern ppnum_t vm_map_get_phys_page(vm_map_t map, - vm_offset_t offset); -static void -vm_test_wire_and_extract(void) -{ - ledger_t ledger; - vm_map_t user_map, wire_map; - mach_vm_address_t user_addr, wire_addr; - mach_vm_size_t user_size, wire_size; - mach_vm_offset_t cur_offset; - vm_prot_t cur_prot, max_prot; - ppnum_t user_ppnum, wire_ppnum; - kern_return_t kr; - - ledger = ledger_instantiate(task_ledger_template, - LEDGER_CREATE_ACTIVE_ENTRIES); - user_map = vm_map_create(pmap_create(ledger, 0, PMAP_CREATE_64BIT), - 0x100000000ULL, - 0x200000000ULL, - TRUE); - wire_map = vm_map_create(NULL, - 0x100000000ULL, - 0x200000000ULL, - TRUE); - user_addr = 0; - user_size = 0x10000; - kr = mach_vm_allocate(user_map, - &user_addr, - user_size, - VM_FLAGS_ANYWHERE); - assert(kr == KERN_SUCCESS); - wire_addr = 0; - wire_size = user_size; - kr = mach_vm_remap(wire_map, - &wire_addr, - wire_size, - 0, - VM_FLAGS_ANYWHERE, - user_map, - user_addr, - FALSE, - &cur_prot, - &max_prot, - VM_INHERIT_NONE); - assert(kr == KERN_SUCCESS); - for (cur_offset = 0; - cur_offset < wire_size; - cur_offset += PAGE_SIZE) { - kr = vm_map_wire_and_extract(wire_map, - wire_addr + cur_offset, - VM_PROT_DEFAULT | VM_PROT_MEMORY_TAG_MAKE(VM_KERN_MEMORY_OSFMK), - TRUE, - &wire_ppnum); - assert(kr == KERN_SUCCESS); - user_ppnum = vm_map_get_phys_page(user_map, - user_addr + cur_offset); - printf("VM_TEST_WIRE_AND_EXTRACT: kr=0x%x " - "user[%p:0x%llx:0x%x] wire[%p:0x%llx:0x%x]\n", - 
kr, - user_map, user_addr + cur_offset, user_ppnum, - wire_map, wire_addr + cur_offset, wire_ppnum); - if (kr != KERN_SUCCESS || - wire_ppnum == 0 || - wire_ppnum != user_ppnum) { - panic("VM_TEST_WIRE_AND_EXTRACT: FAIL\n"); - } - } - cur_offset -= PAGE_SIZE; - kr = vm_map_wire_and_extract(wire_map, - wire_addr + cur_offset, - VM_PROT_DEFAULT, - TRUE, - &wire_ppnum); - assert(kr == KERN_SUCCESS); - printf("VM_TEST_WIRE_AND_EXTRACT: re-wire kr=0x%x " - "user[%p:0x%llx:0x%x] wire[%p:0x%llx:0x%x]\n", - kr, - user_map, user_addr + cur_offset, user_ppnum, - wire_map, wire_addr + cur_offset, wire_ppnum); - if (kr != KERN_SUCCESS || - wire_ppnum == 0 || - wire_ppnum != user_ppnum) { - panic("VM_TEST_WIRE_AND_EXTRACT: FAIL\n"); - } - - printf("VM_TEST_WIRE_AND_EXTRACT: PASS\n"); -} -#else /* VM_TEST_WIRE_AND_EXTRACT */ -#define vm_test_wire_and_extract() -#endif /* VM_TEST_WIRE_AND_EXTRACT */ - -#if VM_TEST_PAGE_WIRE_OVERFLOW_PANIC -static void -vm_test_page_wire_overflow_panic(void) -{ - vm_object_t object; - vm_page_t page; - - printf("VM_TEST_PAGE_WIRE_OVERFLOW_PANIC: starting...\n"); - - object = vm_object_allocate(PAGE_SIZE); - vm_object_lock(object); - page = vm_page_alloc(object, 0x0); - vm_page_lock_queues(); - do { - vm_page_wire(page, 1, FALSE); - } while (page->wire_count != 0); - vm_page_unlock_queues(); - vm_object_unlock(object); - panic("FBDP(%p,%p): wire_count overflow not detected\n", - object, page); -} -#else /* VM_TEST_PAGE_WIRE_OVERFLOW_PANIC */ -#define vm_test_page_wire_overflow_panic() -#endif /* VM_TEST_PAGE_WIRE_OVERFLOW_PANIC */ - -#if __arm64__ && VM_TEST_KERNEL_OBJECT_FAULT -extern int copyinframe(vm_address_t fp, char *frame, boolean_t is64bit); -static void -vm_test_kernel_object_fault(void) -{ - kern_return_t kr; - vm_offset_t stack; - uintptr_t frameb[2]; - int ret; - - kr = kernel_memory_allocate(kernel_map, &stack, - kernel_stack_size + (2 * PAGE_SIZE), - 0, - (KMA_KSTACK | KMA_KOBJECT | - KMA_GUARD_FIRST | KMA_GUARD_LAST), - VM_KERN_MEMORY_STACK); - if (kr != KERN_SUCCESS) { - panic("VM_TEST_KERNEL_OBJECT_FAULT: kernel_memory_allocate kr 0x%x\n", kr); - } - ret = copyinframe((uintptr_t)stack, (char *)frameb, TRUE); - if (ret != 0) { - printf("VM_TEST_KERNEL_OBJECT_FAULT: PASS\n"); - } else { - printf("VM_TEST_KERNEL_OBJECT_FAULT: FAIL\n"); - } - vm_map_remove(kernel_map, - stack, - stack + kernel_stack_size + (2 * PAGE_SIZE), - VM_MAP_REMOVE_KUNWIRE); - stack = 0; -} -#else /* __arm64__ && VM_TEST_KERNEL_OBJECT_FAULT */ -#define vm_test_kernel_object_fault() -#endif /* __arm64__ && VM_TEST_KERNEL_OBJECT_FAULT */ - -#if VM_TEST_DEVICE_PAGER_TRANSPOSE -static void -vm_test_device_pager_transpose(void) -{ - memory_object_t device_pager; - vm_object_t anon_object, device_object; - vm_size_t size; - vm_map_offset_t device_mapping; - kern_return_t kr; - - size = 3 * PAGE_SIZE; - anon_object = vm_object_allocate(size); - assert(anon_object != VM_OBJECT_NULL); - device_pager = device_pager_setup(NULL, 0, size, 0); - assert(device_pager != NULL); - device_object = memory_object_to_vm_object(device_pager); - assert(device_object != VM_OBJECT_NULL); -#if 0 - /* - * Can't actually map this, since another thread might do a - * vm_map_enter() that gets coalesced into this object, which - * would cause the test to fail. 
-    */
-   vm_map_offset_t anon_mapping = 0;
-   kr = vm_map_enter(kernel_map, &anon_mapping, size, 0,
-       VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_NONE,
-       anon_object, 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
-       VM_INHERIT_DEFAULT);
-   assert(kr == KERN_SUCCESS);
-#endif
-   device_mapping = 0;
-   kr = vm_map_enter_mem_object(kernel_map, &device_mapping, size, 0,
-       VM_FLAGS_ANYWHERE,
-       VM_MAP_KERNEL_FLAGS_NONE,
-       VM_KERN_MEMORY_NONE,
-       (void *)device_pager, 0, FALSE,
-       VM_PROT_DEFAULT, VM_PROT_ALL,
-       VM_INHERIT_DEFAULT);
-   assert(kr == KERN_SUCCESS);
-   memory_object_deallocate(device_pager);
-
-   vm_object_lock(anon_object);
-   vm_object_activity_begin(anon_object);
-   anon_object->blocked_access = TRUE;
-   vm_object_unlock(anon_object);
-   vm_object_lock(device_object);
-   vm_object_activity_begin(device_object);
-   device_object->blocked_access = TRUE;
-   vm_object_unlock(device_object);
-
-   assert(anon_object->ref_count == 1);
-   assert(!anon_object->named);
-   assert(device_object->ref_count == 2);
-   assert(device_object->named);
-
-   kr = vm_object_transpose(device_object, anon_object, size);
-   assert(kr == KERN_SUCCESS);
-
-   vm_object_lock(anon_object);
-   vm_object_activity_end(anon_object);
-   anon_object->blocked_access = FALSE;
-   vm_object_unlock(anon_object);
-   vm_object_lock(device_object);
-   vm_object_activity_end(device_object);
-   device_object->blocked_access = FALSE;
-   vm_object_unlock(device_object);
-
-   assert(anon_object->ref_count == 2);
-   assert(anon_object->named);
-#if 0
-   kr = vm_deallocate(kernel_map, anon_mapping, size);
-   assert(kr == KERN_SUCCESS);
-#endif
-   assert(device_object->ref_count == 1);
-   assert(!device_object->named);
-   kr = vm_deallocate(kernel_map, device_mapping, size);
-   assert(kr == KERN_SUCCESS);
-
-   printf("VM_TEST_DEVICE_PAGER_TRANSPOSE: PASS\n");
-}
-#else /* VM_TEST_DEVICE_PAGER_TRANSPOSE */
-#define vm_test_device_pager_transpose()
-#endif /* VM_TEST_DEVICE_PAGER_TRANSPOSE */
-
-void
-vm_tests(void)
-{
-   vm_test_collapse_compressor();
-   vm_test_wire_and_extract();
-   vm_test_page_wire_overflow_panic();
-   vm_test_kernel_object_fault();
-   vm_test_device_pager_transpose();
-}
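(Aside on the code-signing change made in both vm_object_upl_request() and vm_object_iopl_request() earlier in this diff: before a page of a code-signed object can be written through a UPL, it is marked fully tainted and disconnected from every pmap, so the next access refaults and sees the taint, while the referenced information returned by the disconnect is preserved. The sketch below restates only that flow; the struct, constants, and fake_pmap_disconnect() are stand-ins for vm_page_t's vmp_cs_tainted/vmp_pmapped/vmp_reference fields and the real pmap_disconnect().)

#include <stdbool.h>
#include <stdint.h>

struct fake_page {
        bool    pmapped;        /* page has (or had) pmap mappings       */
        bool    reference;      /* software copy of the referenced bit   */
        uint8_t cs_tainted;     /* stand-in for vmp_cs_tainted           */
};

#define CS_ALL_TRUE    0xF      /* stand-in for VMP_CS_ALL_TRUE          */
#define MEM_REFERENCED 0x2      /* stand-in for VM_MEM_REFERENCED        */

/* pretend pmap_disconnect(): removes all mappings, returns ref/mod state */
static int fake_pmap_disconnect(struct fake_page *p) { (void)p; return MEM_REFERENCED; }

void
mark_tainted_before_write(struct fake_page *p, bool object_code_signed)
{
        if (!object_code_signed || p->cs_tainted == CS_ALL_TRUE) {
                return;                  /* nothing to do */
        }
        p->cs_tainted = CS_ALL_TRUE;     /* force re-validation before reuse as code */
        if (p->pmapped) {
                /* unmap everywhere so the next access faults and sees the taint,
                 * but do not lose the referenced information */
                int refmod = fake_pmap_disconnect(p);
                if (refmod & MEM_REFERENCED) {
                        p->reference = true;
                }
        }
}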