X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/2d21ac55c334faf3a56e5634905ed6987fc787d4..0b4c1975fb5e4eccf1012a35081f7e7799b81046:/osfmk/vm/vm_fault.c diff --git a/osfmk/vm/vm_fault.c b/osfmk/vm/vm_fault.c index 87ffc3ee7..a36714b57 100644 --- a/osfmk/vm/vm_fault.c +++ b/osfmk/vm/vm_fault.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -107,22 +107,47 @@ #define VM_FAULT_CLASSIFY 0 -/* Zero-filled pages are marked "m->zero_fill" and put on the - * special zero-fill inactive queue only if they belong to - * an object at least this big. - */ -#define VM_ZF_OBJECT_SIZE_THRESHOLD (0x200000) - #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */ int vm_object_pagein_throttle = 16; +/* + * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which + * kicks in when swap space runs out. 64-bit programs have massive address spaces and can leak enormous amounts + * of memory if they're buggy and can run the system completely out of swap space. If this happens, we + * impose a hard throttle on them to prevent them from taking the last bit of memory left. This helps + * keep the UI active so that the user has a chance to kill the offending task before the system + * completely hangs. + * + * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied + * to tasks that appear to be bloated. When swap runs out, any task using more than vm_hard_throttle_threshold + * will be throttled. The throttling is done by giving the thread that's trying to demand zero a page a + * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again. + */ + +boolean_t thread_is_io_throttled(void); + +uint64_t vm_hard_throttle_threshold; + +extern unsigned int dp_pages_free, dp_pages_reserve; + +#define NEED_TO_HARD_THROTTLE_THIS_TASK() (((dp_pages_free + dp_pages_reserve < 2000) && \ + (get_task_resident_size(current_task()) > vm_hard_throttle_threshold) && \ + (current_task() != kernel_task) && IP_VALID(memory_manager_default)) || \ + (vm_page_free_count < vm_page_throttle_limit && thread_is_io_throttled() && \ + (get_task_resident_size(current_task()) > vm_hard_throttle_threshold))) + + +#define HARD_THROTTLE_DELAY 10000 /* 10000 us == 10 ms */ + + extern int cs_debug; #if MACH_KDB extern struct db_watchpoint *db_watchpoint_list; #endif /* MACH_KDB */ +boolean_t current_thread_aborted(void); /* Forward declarations of internal routines. 
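
A minimal user-space sketch of the hard-throttle policy described above, assuming stand-in globals for dp_pages_free, vm_page_free_count and friends and a hypothetical task_resident_size() helper; only the two conditions of NEED_TO_HARD_THROTTLE_THIS_TASK() are modeled, this is not the kernel code.

/* Sketch of the NEED_TO_HARD_THROTTLE_THIS_TASK() policy, modeled in
 * user space with stand-in state.  The variables and the
 * task_resident_size() helper are hypothetical; only the two throttle
 * conditions mirror the macro in the hunk above. */
#include <stdbool.h>
#include <stdint.h>

/* stand-ins for kernel state */
static unsigned int dp_pages_free, dp_pages_reserve;    /* default pager pool */
static unsigned int vm_page_free_count, vm_page_throttle_limit;
static uint64_t     vm_hard_throttle_threshold;         /* bytes */
static bool         io_throttled, is_kernel_task, default_pager_ready;

static uint64_t task_resident_size(void) { return 0; }  /* hypothetical */

static bool
need_to_hard_throttle_this_task(void)
{
	bool bloated = task_resident_size() > vm_hard_throttle_threshold;

	/* Condition 1: swap is nearly gone and this task looks bloated. */
	if (dp_pages_free + dp_pages_reserve < 2000 &&
	    bloated && !is_kernel_task && default_pager_ready)
		return true;

	/* Condition 2: free pages are below the throttle limit, the thread
	 * is already being I/O throttled, and the task looks bloated. */
	if (vm_page_free_count < vm_page_throttle_limit &&
	    io_throttled && bloated)
		return true;

	return false;
}
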
*/ extern kern_return_t vm_fault_wire_fast( @@ -149,6 +174,18 @@ extern void vm_fault_classify(vm_object_t object, extern void vm_fault_classify_init(void); #endif +unsigned long vm_pmap_enter_blocked = 0; + +unsigned long vm_cs_validates = 0; +unsigned long vm_cs_revalidates = 0; +unsigned long vm_cs_query_modified = 0; +unsigned long vm_cs_validated_dirtied = 0; +#if CONFIG_ENFORCE_SIGNED_CODE +int cs_enforcement_disable=0; +#else +static const int cs_enforcement_disable=1; +#endif + /* * Routine: vm_fault_init * Purpose: @@ -157,6 +194,24 @@ extern void vm_fault_classify_init(void); void vm_fault_init(void) { +#if !SECURE_KERNEL +#if CONFIG_ENFORCE_SIGNED_CODE + PE_parse_boot_argn("cs_enforcement_disable", &cs_enforcement_disable, + sizeof (cs_enforcement_disable)); +#endif + PE_parse_boot_argn("cs_debug", &cs_debug, sizeof (cs_debug)); +#endif + + /* + * Choose a value for the hard throttle threshold based on the amount of ram. The threshold is + * computed as a percentage of available memory, and the percentage used is scaled inversely with + * the amount of memory. The pertange runs between 10% and 35%. We use 35% for small memory systems + * and reduce the value down to 10% for very large memory configurations. This helps give us a + * definition of a memory hog that makes more sense relative to the amount of ram in the machine. + * The formula here simply uses the number of gigabytes of ram to adjust the percentage. + */ + + vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100; } /* @@ -216,8 +271,11 @@ boolean_t vm_page_deactivate_behind = TRUE; /* * default sizes given VM_BEHAVIOR_DEFAULT reference behavior */ -int vm_default_ahead = 0; -int vm_default_behind = MAX_UPL_TRANSFER; +#define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW 128 +#define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER 16 /* don't make this too big... 
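
The threshold assignment in vm_fault_init() above slides the "memory hog" cutoff from 35% of RAM on small machines down to 10% at 25 GB and beyond. A stand-alone program (illustrative only, not from the source) evaluates the same expression for a few RAM sizes:

/* Stand-alone demo of the vm_hard_throttle_threshold formula above.
 * sane_size is the amount of usable RAM in bytes. */
#include <stdio.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

static uint64_t
hard_throttle_threshold(uint64_t sane_size)
{
	/* 35% for small memory systems, sliding down 1% per GB to 10%. */
	return sane_size *
	    (35 - MIN((int)(sane_size / (1024ULL * 1024 * 1024)), 25)) / 100;
}

int
main(void)
{
	uint64_t gb = 1024ULL * 1024 * 1024;
	uint64_t sizes[] = { 1 * gb, 2 * gb, 8 * gb, 16 * gb, 32 * gb };

	for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		uint64_t t = hard_throttle_threshold(sizes[i]);
		printf("%3llu GB ram -> threshold %llu MB (%llu%% of ram)\n",
		    (unsigned long long)(sizes[i] / gb),
		    (unsigned long long)(t / (1024 * 1024)),
		    (unsigned long long)(t * 100 / sizes[i]));
	}
	return 0;
}

For example, a 1 GB machine ends up with a 34% threshold (roughly 348 MB resident), while a 32 GB machine gets 10% (about 3.2 GB).
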
*/ + /* we use it to size an array on the stack */ + +int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW; #define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024) @@ -334,6 +392,8 @@ vm_fault_is_sequential( } +int vm_page_deactivate_behind_count = 0; + /* * vm_page_deactivate_behind * @@ -353,10 +413,17 @@ vm_fault_deactivate_behind( vm_object_offset_t offset, vm_behavior_t behavior) { - vm_page_t m = NULL; + int n; + int pages_in_run = 0; + int max_pages_in_run = 0; int sequential_run; int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL; + vm_object_offset_t run_offset = 0; + vm_object_offset_t pg_offset = 0; + vm_page_t m; + vm_page_t page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER]; + pages_in_run = 0; #if TRACEFAULTPAGE dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */ #endif @@ -381,12 +448,16 @@ vm_fault_deactivate_behind( case VM_BEHAVIOR_RANDOM: break; case VM_BEHAVIOR_SEQUENTIAL: - if (sequential_run >= (int)PAGE_SIZE) - m = vm_page_lookup(object, offset - PAGE_SIZE_64); + if (sequential_run >= (int)PAGE_SIZE) { + run_offset = 0 - PAGE_SIZE_64; + max_pages_in_run = 1; + } break; case VM_BEHAVIOR_RSEQNTL: - if (sequential_run >= (int)PAGE_SIZE) - m = vm_page_lookup(object, offset + PAGE_SIZE_64); + if (sequential_run >= (int)PAGE_SIZE) { + run_offset = PAGE_SIZE_64; + max_pages_in_run = 1; + } break; case VM_BEHAVIOR_DEFAULT: default: @@ -397,32 +468,109 @@ vm_fault_deactivate_behind( * long enough on an object with default access behavior * to consider it for deactivation */ - if ((uint64_t)sequential_run >= behind) { + if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) { + /* + * the comparisons between offset and behind are done + * in this kind of odd fashion in order to prevent wrap around + * at the end points + */ if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) { - if (offset >= behind) - m = vm_page_lookup(object, offset - behind); + if (offset >= behind) { + run_offset = 0 - behind; + pg_offset = PAGE_SIZE_64; + max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER; + } } else { - if (offset < -behind) - m = vm_page_lookup(object, offset + behind); + if (offset < -behind) { + run_offset = behind; + pg_offset = 0 - PAGE_SIZE_64; + max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER; + } } } break; } } - if (m) { - if (!m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) { - pmap_clear_reference(m->phys_page); - m->deactivated = TRUE; + for (n = 0; n < max_pages_in_run; n++) { + m = vm_page_lookup(object, offset + run_offset + (n * pg_offset)); + + if (m && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) { + page_run[pages_in_run++] = m; + pmap_clear_reference(m->phys_page); + } + } + if (pages_in_run) { + vm_page_lockspin_queues(); + + for (n = 0; n < pages_in_run; n++) { + + m = page_run[n]; + + vm_page_deactivate_internal(m, FALSE); + + vm_page_deactivate_behind_count++; #if TRACEFAULTPAGE dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ #endif - return TRUE; } + vm_page_unlock_queues(); + + return TRUE; } return FALSE; } +static boolean_t +vm_page_throttled(void) +{ + clock_sec_t elapsed_sec; + clock_sec_t tv_sec; + clock_usec_t tv_usec; + + thread_t thread = current_thread(); + + if (thread->options & TH_OPT_VMPRIV) + return (FALSE); + + thread->t_page_creation_count++; + + if (NEED_TO_HARD_THROTTLE_THIS_TASK()) + return (TRUE); + + if (vm_page_free_count < 
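
The reworked vm_fault_deactivate_behind() above gathers up to VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER trailing pages into an on-stack array before taking the page-queue lock once for the whole run. A simplified sketch of that two-phase batching, with stand-in page and lock helpers rather than the kernel interfaces:

/* Two-phase batching shape used by vm_fault_deactivate_behind() above:
 * collect candidates first, then take the queue lock once for the run.
 * page_t, page_lookup(), queue_lock()/queue_unlock() and deactivate()
 * are stand-ins, not the kernel interfaces. */
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

#define DEACTIVATE_BEHIND_CLUSTER 16	/* keeps the on-stack array small */

typedef struct page { bool busy, absent; } page_t;

/* stand-ins for vm_page_lookup(), vm_page_lockspin_queues(), etc. */
static page_t *page_lookup(int64_t offset) { (void)offset; return NULL; }
static void queue_lock(void) {}
static void queue_unlock(void) {}
static void deactivate(page_t *m) { (void)m; }

static int
deactivate_behind(int64_t offset, int64_t run_offset, int64_t pg_offset,
    int max_pages_in_run)
{
	page_t *page_run[DEACTIVATE_BEHIND_CLUSTER];
	int pages_in_run = 0;

	if (max_pages_in_run > DEACTIVATE_BEHIND_CLUSTER)
		max_pages_in_run = DEACTIVATE_BEHIND_CLUSTER;

	/* Phase 1: collect candidates with no page-queue lock held. */
	for (int n = 0; n < max_pages_in_run; n++) {
		page_t *m = page_lookup(offset + run_offset + n * pg_offset);

		if (m != NULL && !m->busy && !m->absent)
			page_run[pages_in_run++] = m;
	}
	if (pages_in_run == 0)
		return 0;

	/* Phase 2: one lock acquisition covers the whole batch. */
	queue_lock();
	for (int n = 0; n < pages_in_run; n++)
		deactivate(page_run[n]);
	queue_unlock();

	return pages_in_run;
}
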
vm_page_throttle_limit && + thread->t_page_creation_count > vm_page_creation_throttle) { + + clock_get_system_microtime(&tv_sec, &tv_usec); + + elapsed_sec = tv_sec - thread->t_page_creation_time; + + if (elapsed_sec <= 6 || (thread->t_page_creation_count / elapsed_sec) >= (vm_page_creation_throttle / 6)) { + + if (elapsed_sec >= 60) { + /* + * we'll reset our stats to give a well behaved app + * that was unlucky enough to accumulate a bunch of pages + * over a long period of time a chance to get out of + * the throttled state... we reset the counter and timestamp + * so that if it stays under the rate limit for the next second + * it will be back in our good graces... if it exceeds it, it + * will remain in the throttled state + */ + thread->t_page_creation_time = tv_sec; + thread->t_page_creation_count = (vm_page_creation_throttle / 6) * 5; + } + ++vm_page_throttle_count; + + return (TRUE); + } + thread->t_page_creation_time = tv_sec; + thread->t_page_creation_count = 0; + } + return (FALSE); +} + + /* * check for various conditions that would * prevent us from creating a ZF page... @@ -434,10 +582,14 @@ vm_fault_deactivate_behind( static vm_fault_return_t vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state) { - if (object->shadow_severed) { + if (object->shadow_severed || + VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) { /* - * the shadow chain was severed - * just have to return an error at this point + * Either: + * 1. the shadow chain was severed, + * 2. the purgeable object is volatile or empty and is marked + * to fault on access while volatile. + * Just have to return an error at this point */ if (m != VM_PAGE_NULL) VM_PAGE_FREE(m); @@ -467,7 +619,7 @@ vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t int return (VM_FAULT_RETRY); } } - if (VM_PAGE_ZFILL_THROTTLED()) { + if (vm_page_throttled()) { /* * we're throttling zero-fills... * treat this as if we couldn't grab a page @@ -476,6 +628,15 @@ vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t int VM_PAGE_FREE(m); vm_fault_cleanup(object, first_m); + if (NEED_TO_HARD_THROTTLE_THIS_TASK()) { + delay(HARD_THROTTLE_DELAY); + + if (current_thread_aborted()) { + thread_interrupt_level(interruptible_state); + return VM_FAULT_INTERRUPTED; + } + } + thread_interrupt_level(interruptible_state); return (VM_FAULT_MEMORY_SHORTAGE); @@ -530,8 +691,9 @@ vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) if (!IP_VALID(memory_manager_default) && (m->object->purgable == VM_PURGABLE_DENY || - m->object->purgable == VM_PURGABLE_NONVOLATILE)) { - vm_page_lock_queues(); + m->object->purgable == VM_PURGABLE_NONVOLATILE || + m->object->purgable == VM_PURGABLE_VOLATILE )) { + vm_page_lockspin_queues(); queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq); m->throttled = TRUE; @@ -539,9 +701,9 @@ vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) vm_page_unlock_queues(); } else { - if (m->object->size > VM_ZF_OBJECT_SIZE_THRESHOLD) { + if (current_thread()->t_page_creation_count > vm_page_creation_throttle) { m->zero_fill = TRUE; - OSAddAtomic(1, (SInt32 *)&vm_zf_count); + VM_ZF_COUNT_INCR(); } } return (my_fault); @@ -589,7 +751,14 @@ vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) * be destroyed when this guarantee is no longer required. * The "result_page" is also left busy. It is not removed * from the pageout queues. 
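
vm_page_throttled() above is a per-thread rate limiter: every zero-fill bumps t_page_creation_count, and once the count passes vm_page_creation_throttle the thread stays throttled until its long-run rate falls below roughly one sixth of the limit per second, with a reset path after 60 seconds. A user-space model of that decision, with hypothetical names and limits:

/* User-space model of the per-thread zero-fill rate limit in
 * vm_page_throttled() above.  The struct and the limit are stand-ins. */
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

#define PAGE_CREATION_THROTTLE 3000	/* stand-in for vm_page_creation_throttle */

struct thread_model {
	uint64_t page_creation_count;
	time_t   page_creation_time;
};

static bool
page_creation_throttled(struct thread_model *th, bool memory_is_tight)
{
	time_t now = time(NULL);
	time_t elapsed = now - th->page_creation_time;

	th->page_creation_count++;

	if (!memory_is_tight ||
	    th->page_creation_count <= PAGE_CREATION_THROTTLE)
		return false;

	/* Throttle while the burst is recent or the long-run rate is still
	 * above ~1/6 of the limit per second. */
	if (elapsed <= 6 ||
	    th->page_creation_count / elapsed >= PAGE_CREATION_THROTTLE / 6) {
		if (elapsed >= 60) {
			/* Give a long-running but well-behaved thread a way
			 * out: restart the window with most of the budget
			 * already spent, so one quiet second clears it. */
			th->page_creation_time = now;
			th->page_creation_count = (PAGE_CREATION_THROTTLE / 6) * 5;
		}
		return true;
	}
	/* The rate has dropped: reset the window and stop throttling. */
	th->page_creation_time = now;
	th->page_creation_count = 0;
	return false;
}
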
+ * Special Case: + * A return value of VM_FAULT_SUCCESS_NO_PAGE means that the + * fault succeeded but there's no VM page (i.e. the VM object + * does not actually hold VM pages, but device memory or + * large pages). The object is still locked and we still hold a + * paging_in_progress reference. */ +unsigned int vm_fault_page_blocked_access = 0; vm_fault_return_t vm_fault_page( @@ -636,6 +805,7 @@ vm_fault_page( uint32_t try_failed_count; int interruptible; /* how may fault be interrupted? */ memory_object_t pager; + vm_fault_return_t retval; /* * MACH page map - an optional optimization where a bit map is maintained @@ -676,23 +846,15 @@ vm_fault_page( /* * Recovery actions */ -#define PREPARE_RELEASE_PAGE(m) \ - MACRO_BEGIN \ - vm_page_lock_queues(); \ - MACRO_END - -#define DO_RELEASE_PAGE(m) \ - MACRO_BEGIN \ - PAGE_WAKEUP_DONE(m); \ - if (!m->active && !m->inactive && !m->throttled)\ - vm_page_activate(m); \ - vm_page_unlock_queues(); \ - MACRO_END - #define RELEASE_PAGE(m) \ MACRO_BEGIN \ - PREPARE_RELEASE_PAGE(m); \ - DO_RELEASE_PAGE(m); \ + PAGE_WAKEUP_DONE(m); \ + if (!m->active && !m->inactive && !m->throttled) { \ + vm_page_lockspin_queues(); \ + if (!m->active && !m->inactive && !m->throttled) \ + vm_page_activate(m); \ + vm_page_unlock_queues(); \ + } \ MACRO_END #if TRACEFAULTPAGE @@ -755,7 +917,7 @@ vm_fault_page( XPR(XPR_VM_FAULT, "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n", - (integer_t)object, offset, fault_type, *protection, 0); + object, offset, fault_type, *protection, 0); /* * default type of fault @@ -777,6 +939,35 @@ vm_fault_page( return (VM_FAULT_MEMORY_ERROR); } + if (!object->pager_created && object->phys_contiguous) { + /* + * A physically-contiguous object without a pager: + * must be a "large page" object. We do not deal + * with VM pages for this object. + */ + m = VM_PAGE_NULL; + goto phys_contig_object; + } + + if (object->blocked_access) { + /* + * Access to this VM object has been blocked. + * Replace our "paging_in_progress" reference with + * a "activity_in_progress" reference and wait for + * access to be unblocked. 
+ */ + vm_object_activity_begin(object); + vm_object_paging_end(object); + while (object->blocked_access) { + vm_object_sleep(object, + VM_OBJECT_EVENT_UNBLOCKED, + THREAD_UNINT); + } + vm_fault_page_blocked_access++; + vm_object_paging_begin(object); + vm_object_activity_end(object); + } + /* * See whether the page at 'offset' is resident */ @@ -802,8 +993,8 @@ vm_fault_page( wait_result = PAGE_SLEEP(object, m, interruptible); XPR(XPR_VM_FAULT, "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n", - (integer_t)object, offset, - (integer_t)m, 0, 0); + object, offset, + m, 0, 0); counter(c_vm_fault_page_block_busy_kernel++); if (wait_result != THREAD_AWAKENED) { @@ -917,9 +1108,9 @@ vm_fault_page( XPR(XPR_VM_FAULT, "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n", - (integer_t)object, offset, - (integer_t)m, - (integer_t)first_object, 0); + object, offset, + m, + first_object, 0); if (object != first_object) { /* @@ -959,6 +1150,8 @@ vm_fault_page( */ my_fault = vm_fault_zero_page(m, no_zero_fill); + if (fault_info->mark_zf_absent && no_zero_fill == TRUE) + m->absent = TRUE; break; } else { if (must_be_resident) @@ -977,8 +1170,8 @@ vm_fault_page( } XPR(XPR_VM_FAULT, "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n", - (integer_t)object, offset, - (integer_t)next_object, + object, offset, + next_object, offset+object->shadow_offset,0); offset += object->shadow_offset; @@ -1017,8 +1210,8 @@ vm_fault_page( #endif XPR(XPR_VM_FAULT, "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n", - (integer_t)object, offset, - (integer_t)m, 0, 0); + object, offset, + m, 0, 0); /* * take an extra ref so that object won't die */ @@ -1049,7 +1242,8 @@ vm_fault_page( return (VM_FAULT_RETRY); } } - if (type_of_fault == NULL && m->speculative) { + if (type_of_fault == NULL && m->speculative && + !(fault_info != NULL && fault_info->stealth)) { /* * If we were passed a non-NULL pointer for * "type_of_fault", than we came from @@ -1060,6 +1254,10 @@ vm_fault_page( * take it off the speculative queue, we'll * let the caller of vm_fault_page deal * with getting it onto the correct queue + * + * If the caller specified in fault_info that + * it wants a "stealth" fault, we also leave + * the page in the speculative queue. */ vm_page_lockspin_queues(); VM_PAGE_QUEUES_REMOVE(m); @@ -1114,7 +1312,7 @@ vm_fault_page( #endif XPR(XPR_VM_FAULT, "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n", - (integer_t)object, offset, (integer_t)m, 0, 0); + object, offset, m, 0, 0); assert(!m->busy); assert(!m->absent); @@ -1168,7 +1366,7 @@ vm_fault_page( XPR(XPR_VM_FAULT, "vm_f_page: ready wait obj 0x%X, offset 0x%X\n", - (integer_t)object, offset, 0, 0, 0); + object, offset, 0, 0, 0); /* * take an extra ref so object won't die @@ -1288,7 +1486,7 @@ vm_fault_page( XPR(XPR_VM_FAULT, "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n", - (integer_t)object, offset, (integer_t)m, + object, offset, m, access_required | wants_copy_flag, 0); /* @@ -1314,6 +1512,13 @@ vm_fault_page( return ((rc == MACH_SEND_INTERRUPTED) ? VM_FAULT_INTERRUPTED : VM_FAULT_MEMORY_ERROR); + } else { + clock_sec_t tv_sec; + clock_usec_t tv_usec; + + clock_get_system_microtime(&tv_sec, &tv_usec); + current_thread()->t_page_creation_time = tv_sec; + current_thread()->t_page_creation_count = 0; } if ((interruptible != THREAD_UNINT) && (current_thread()->sched_mode & TH_MODE_ABORT)) { @@ -1336,7 +1541,8 @@ vm_fault_page( * page fault against the object's new backing * store (different memory object). 
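
The blocked_access handling above trades the paging_in_progress reference for an activity_in_progress reference before sleeping, so work that needs paging activity to drain is not held up by the sleeping faulter. A pthread-based sketch of the same shape, with a hypothetical object layout (the real code uses vm_object_sleep() and the VM object lock):

/* Sketch of the blocked_access wait above: convert the paging reference
 * to an activity reference, sleep until the object is unblocked, then
 * convert back.  The object layout and fields are stand-ins. */
#include <pthread.h>
#include <stdbool.h>

struct object_model {
	pthread_mutex_t lock;
	pthread_cond_t  unblocked;
	bool            blocked_access;
	int             paging_in_progress;
	int             activity_in_progress;
};

/* Called with obj->lock held and one paging_in_progress reference. */
static void
wait_for_unblocked(struct object_model *obj)
{
	/* Trade the paging reference for an activity reference so that
	 * anything waiting for paging activity to drain can make progress
	 * while we sleep. */
	obj->activity_in_progress++;
	obj->paging_in_progress--;

	while (obj->blocked_access)
		pthread_cond_wait(&obj->unblocked, &obj->lock);

	/* Access is open again: take the paging reference back. */
	obj->paging_in_progress++;
	obj->activity_in_progress--;
}
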
*/ - break; + phys_contig_object: + goto done; } /* * potentially a pagein fault @@ -1370,8 +1576,8 @@ vm_fault_page( XPR(XPR_VM_FAULT, "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n", - (integer_t)object, offset, (integer_t)m, - (integer_t)object->shadow, 0); + object, offset, m, + object->shadow, 0); next_object = object->shadow; @@ -1419,6 +1625,8 @@ vm_fault_page( } my_fault = vm_fault_zero_page(m, no_zero_fill); + if (fault_info->mark_zf_absent && no_zero_fill == TRUE) + m->absent = TRUE; break; } else { @@ -1464,12 +1672,10 @@ vm_fault_page( dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ #endif #if EXTRA_ASSERTIONS - if (m != VM_PAGE_NULL) { - assert(m->busy && !m->absent); - assert((first_m == VM_PAGE_NULL) || - (first_m->busy && !first_m->absent && - !first_m->active && !first_m->inactive)); - } + assert(m->busy && !m->absent); + assert((first_m == VM_PAGE_NULL) || + (first_m->busy && !first_m->absent && + !first_m->active && !first_m->inactive)); #endif /* EXTRA_ASSERTIONS */ /* @@ -1477,14 +1683,12 @@ vm_fault_page( * If we found a page, we must have decrypted it before we * get here... */ - if (m != VM_PAGE_NULL) { - ASSERT_PAGE_DECRYPTED(m); - } + ASSERT_PAGE_DECRYPTED(m); XPR(XPR_VM_FAULT, "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n", - (integer_t)object, offset, (integer_t)m, - (integer_t)first_object, (integer_t)first_m); + object, offset, m, + first_object, first_m); /* * If the page is being written, but isn't @@ -1492,7 +1696,7 @@ vm_fault_page( * we have to copy it into a new page owned * by the top-level object. */ - if ((object != first_object) && (m != VM_PAGE_NULL)) { + if (object != first_object) { #if TRACEFAULTPAGE dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */ @@ -1556,8 +1760,8 @@ vm_fault_page( } XPR(XPR_VM_FAULT, "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n", - (integer_t)object, offset, - (integer_t)m, (integer_t)copy_m, 0); + object, offset, + m, copy_m, 0); vm_page_copy(m, copy_m); @@ -1632,7 +1836,7 @@ vm_fault_page( */ try_failed_count = 0; - while ((copy_object = first_object->copy) != VM_OBJECT_NULL && (m != VM_PAGE_NULL)) { + while ((copy_object = first_object->copy) != VM_OBJECT_NULL) { vm_object_offset_t copy_offset; vm_page_t copy_m; @@ -1897,14 +2101,17 @@ vm_fault_page( break; } + +done: *result_page = m; *top_page = first_m; XPR(XPR_VM_FAULT, "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n", - (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0); + object, offset, m, first_m, 0); if (m != VM_PAGE_NULL) { + retval = VM_FAULT_SUCCESS; if (my_fault == DBG_PAGEIN_FAULT) { VM_STAT_INCR(pageins); @@ -1914,8 +2121,10 @@ vm_fault_page( if (m->object->internal) { DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL); + my_fault = DBG_PAGEIND_FAULT; } else { DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL); + my_fault = DBG_PAGEINV_FAULT; } /* @@ -1929,15 +2138,18 @@ vm_fault_page( } if (type_of_fault) *type_of_fault = my_fault; - } else - vm_object_unlock(object); + } else { + retval = VM_FAULT_SUCCESS_NO_VM_PAGE; + assert(first_m == VM_PAGE_NULL); + assert(object == first_object); + } thread_interrupt_level(interruptible_state); #if TRACEFAULTPAGE dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */ #endif - return (VM_FAULT_SUCCESS); + return retval; backoff: thread_interrupt_level(interruptible_state); @@ -1951,6 +2163,21 @@ backoff: +/* + * CODE SIGNING: + * 
When soft faulting a page, we have to validate the page if: + * 1. the page is being mapped in user space + * 2. the page hasn't already been found to be "tainted" + * 3. the page belongs to a code-signed object + * 4. the page has not been validated yet or has been mapped for write. + */ +#define VM_FAULT_NEED_CS_VALIDATION(pmap, page) \ + ((pmap) != kernel_pmap /*1*/ && \ + !(page)->cs_tainted /*2*/ && \ + (page)->object->code_signed /*3*/ && \ + (!(page)->cs_validated || (page)->wpmapped /*4*/)) + + /* * page queue lock must NOT be held * m->object must be locked @@ -1973,12 +2200,14 @@ vm_fault_enter(vm_page_t m, int *type_of_fault) { unsigned int cache_attr; - kern_return_t kr; + kern_return_t kr, pe_result; boolean_t previously_pmapped = m->pmapped; - + boolean_t must_disconnect = 0; + boolean_t map_is_switched, map_is_switch_protected; + vm_object_lock_assert_held(m->object); #if DEBUG - mutex_assert(&vm_page_queue_lock, MA_NOTOWNED); + lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED); #endif /* DEBUG */ if (m->phys_page == vm_page_guard_addr) { @@ -1988,22 +2217,6 @@ vm_fault_enter(vm_page_t m, cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK; - if (m->object->code_signed && !m->cs_validated && - pmap != kernel_pmap) { - /* - * CODE SIGNING: - * This page comes from a VM object backed by a - * signed memory object and it hasn't been validated yet. - * We're about to enter it into a process address space, - * so we need to validate its signature now. - */ - vm_object_lock_assert_exclusive(m->object); - - /* VM map still locked, so 1 ref will remain on VM object */ - - vm_page_validate_cs(m); - } - if (m->pmapped == FALSE) { /* * This is the first time this page is being @@ -2028,13 +2241,13 @@ vm_fault_enter(vm_page_t m, if (m->object->internal) { DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL); + *type_of_fault = DBG_PAGEIND_FAULT; } else { DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL); + *type_of_fault = DBG_PAGEINV_FAULT; } current_task()->pageins++; - - *type_of_fault = DBG_PAGEIN_FAULT; } VM_PAGE_CONSUME_CLUSTERED(m); @@ -2049,33 +2262,120 @@ vm_fault_enter(vm_page_t m, } } - if (m->cs_tainted) { - /* - * CODE SIGNING: - * This page has been tainted and can not be trusted. - * Let's notify the current process and let it take any - * necessary precautions before we enter the tainted page - * into its address space. - */ - if (cs_invalid_page()) { + /* Validate code signature if necessary. */ + if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) { + vm_object_lock_assert_exclusive(m->object); + + if (m->cs_validated) { + vm_cs_revalidates++; + } + + /* VM map is locked, so 1 ref will remain on VM object - + * so no harm if vm_page_validate_cs drops the object lock */ + vm_page_validate_cs(m); + } + +#define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/) + + map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) && + (pmap == vm_map_pmap(current_thread()->map))); + map_is_switch_protected = current_thread()->map->switch_protect; + + /* If the map is switched, and is switch-protected, we must protect + * some pages from being write-faulted: immutable pages because by + * definition they may not be written, and executable pages because that + * would provide a way to inject unsigned code. + * If the page is immutable, we can simply return. However, we can't + * immediately determine whether a page is executable anywhere. But, + * we can disconnect it everywhere and remove the executable protection + * from the current map. 
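
The four numbered conditions above are what VM_FAULT_NEED_CS_VALIDATION() tests. A stand-alone restatement with stand-in flags in place of the kernel's page and object structures:

/* Restatement of VM_FAULT_NEED_CS_VALIDATION() above with stand-in types:
 * validate on a user-space soft fault of a code-signed page that is not
 * already known to be tainted and is either unvalidated or has been
 * mapped writable at some point. */
#include <stdbool.h>

struct page_model {
	bool cs_tainted;	 /* page already found to be tainted        */
	bool cs_validated;	 /* signature already checked               */
	bool wpmapped;		 /* ever entered into a pmap writable       */
	bool object_code_signed; /* backing object is code signed           */
};

static bool
need_cs_validation(bool is_kernel_pmap, const struct page_model *p)
{
	return !is_kernel_pmap &&		/* 1. user-space mapping   */
	    !p->cs_tainted &&			/* 2. not already tainted  */
	    p->object_code_signed &&		/* 3. code-signed object   */
	    (!p->cs_validated || p->wpmapped);	/* 4. unvalidated, or was
						 *    mapped for write     */
}
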
We do that below right before we do the + * PMAP_ENTER. + */ + if(!cs_enforcement_disable && map_is_switched && + map_is_switch_protected && page_immutable(m, prot) && + (prot & VM_PROT_WRITE)) + { + return KERN_CODESIGN_ERROR; + } + + /* A page could be tainted, or pose a risk of being tainted later. + * Check whether the receiving process wants it, and make it feel + * the consequences (that hapens in cs_invalid_page()). + * For CS Enforcement, two other conditions will + * cause that page to be tainted as well: + * - pmapping an unsigned page executable - this means unsigned code; + * - writeable mapping of a validated page - the content of that page + * can be changed without the kernel noticing, therefore unsigned + * code can be created + */ + if (m->cs_tainted || + ( !cs_enforcement_disable && + (/* The page is unsigned and wants to be executable */ + (!m->cs_validated && (prot & VM_PROT_EXECUTE)) || + /* The page should be immutable, but is in danger of being modified + * This is the case where we want policy from the code directory - + * is the page immutable or not? For now we have to assume that + * code pages will be immutable, data pages not. + * We'll assume a page is a code page if it has a code directory + * and we fault for execution. + * That is good enough since if we faulted the code page for + * writing in another map before, it is wpmapped; if we fault + * it for writing in this map later it will also be faulted for executing + * at the same time; and if we fault for writing in another map + * later, we will disconnect it from this pmap so we'll notice + * the change. + */ + (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped)) + )) + ) + { + /* We will have a tainted page. Have to handle the special case + * of a switched map now. If the map is not switched, standard + * procedure applies - call cs_invalid_page(). + * If the map is switched, the real owner is invalid already. + * There is no point in invalidating the switching process since + * it will not be executing from the map. So we don't call + * cs_invalid_page() in that case. */ + boolean_t reject_page; + if(map_is_switched) { + assert(pmap==vm_map_pmap(current_thread()->map)); + assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE)); + reject_page = FALSE; + } else { + reject_page = cs_invalid_page((addr64_t) vaddr); + } + + if (reject_page) { /* reject the tainted page: abort the page fault */ - kr = KERN_MEMORY_ERROR; + kr = KERN_CODESIGN_ERROR; cs_enter_tainted_rejected++; } else { /* proceed with the tainted page */ kr = KERN_SUCCESS; + /* Page might have been tainted before or not; now it + * definitively is. If the page wasn't tainted, we must + * disconnect it from all pmaps later. */ + must_disconnect = !m->cs_tainted; + m->cs_tainted = TRUE; cs_enter_tainted_accepted++; } if (cs_debug || kr != KERN_SUCCESS) { printf("CODESIGNING: vm_fault_enter(0x%llx): " - "page %p obj %p off 0x%llx *** TAINTED ***\n", + "page %p obj %p off 0x%llx *** INVALID PAGE ***\n", (long long)vaddr, m, m->object, m->offset); } + } else { /* proceed with the valid page */ kr = KERN_SUCCESS; } + /* If we have a KERN_SUCCESS from the previous checks, we either have + * a good page, or a tainted page that has been accepted by the process. + * In both cases the page will be entered into the pmap. + * If the page is writeable, we need to disconnect it from other pmaps + * now so those processes can take note. 
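
A sketch of the tainted-page decision above, with stand-in flags; cs_page_is_invalid() stands in for cs_invalid_page(), i.e. whether the owning process rejects invalid pages. The switched-map case skips that query entirely, since the map's real owner is not the one executing.

/* Sketch of the tainted-page decision above.  Flags are stand-ins;
 * cs_page_is_invalid() models cs_invalid_page(). */
#include <stdbool.h>

struct cs_page {
	bool cs_tainted;
	bool cs_validated;
	bool wpmapped;
};

static bool cs_page_is_invalid(void) { return true; }	/* stand-in policy */

/* Returns true if the fault must fail with a code-signing error.
 * *newly_tainted is set when the page must also be disconnected from
 * every pmap it is currently entered in. */
static bool
cs_reject_fault(struct cs_page *p, bool enforcement, bool want_execute,
    bool want_write, bool map_is_switched, bool *newly_tainted)
{
	bool tainted;

	*newly_tainted = false;

	tainted = p->cs_tainted ||
	    (enforcement &&
	     (/* unsigned page about to become executable */
	      (!p->cs_validated && want_execute) ||
	      /* validated (assumed immutable) page in danger of changing */
	      (p->cs_validated && (want_write || p->wpmapped))));

	if (!tainted)
		return false;

	/* A switched map's real owner is not executing from it, so there
	 * is no point notifying it; accept the page without asking. */
	if (!map_is_switched && cs_page_is_invalid())
		return true;		/* reject: abort the page fault */

	/* Accept the tainted page; remember whether it just became so. */
	*newly_tainted = !p->cs_tainted;
	p->cs_tainted = true;
	return false;
}
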
+ */ if (kr == KERN_SUCCESS) { /* * NOTE: we may only hold the vm_object lock SHARED @@ -2083,12 +2383,56 @@ vm_fault_enter(vm_page_t m, * since this is the ONLY bit updated behind the SHARED * lock... however, we need to figure out how to do an atomic * update on a bit field to make this less fragile... right - * now I don'w know how to coerce 'C' to give me the offset info + * now I don't know how to coerce 'C' to give me the offset info * that's needed for an AtomicCompareAndSwap */ m->pmapped = TRUE; + if (prot & VM_PROT_WRITE) { + vm_object_lock_assert_exclusive(m->object); + m->wpmapped = TRUE; + if(must_disconnect) { + /* We can only get here + * because of the CSE logic */ + assert(cs_enforcement_disable == FALSE); + pmap_disconnect(m->phys_page); + /* If we are faulting for a write, we can clear + * the execute bit - that will ensure the page is + * checked again before being executable, which + * protects against a map switch. + * This only happens the first time the page + * gets tainted, so we won't get stuck here + * to make an already writeable page executable. */ + prot &= ~VM_PROT_EXECUTE; + } + } + + /* Prevent a deadlock by not + * holding the object lock if we need to wait for a page in + * pmap_enter() - */ + PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, cache_attr, + wired, PMAP_OPTIONS_NOWAIT, pe_result); + + if(pe_result == KERN_RESOURCE_SHORTAGE) { + /* The nonblocking version of pmap_enter did not succeed. + * Use the blocking version instead. Requires marking + * the page busy and unlocking the object */ + boolean_t was_busy = m->busy; + m->busy = TRUE; + vm_object_unlock(m->object); + + PMAP_ENTER(pmap, vaddr, m, prot, cache_attr, wired); - PMAP_ENTER(pmap, vaddr, m, prot, cache_attr, wired); + /* Take the object lock again. */ + vm_object_lock(m->object); + + /* If the page was busy, someone else will wake it up. + * Otherwise, we have to do it now. */ + assert(m->busy); + if(!was_busy) { + PAGE_WAKEUP_DONE(m); + } + vm_pmap_enter_blocked++; + } } /* @@ -2104,22 +2448,67 @@ vm_fault_enter(vm_page_t m, vm_page_wire(m); } } else { - vm_page_unwire(m); + vm_page_unwire(m, TRUE); } vm_page_unlock_queues(); } else { if (kr != KERN_SUCCESS) { - vm_page_lock_queues(); + vm_page_lockspin_queues(); vm_page_deactivate(m); vm_page_unlock_queues(); } else { - if (((!m->active && !m->inactive) || no_cache) && !m->wire_count && !m->throttled) { + if (((!m->active && !m->inactive) || no_cache) && !VM_PAGE_WIRED(m) && !m->throttled) { + + if ( vm_page_local_q && !no_cache && (*type_of_fault == DBG_COW_FAULT || *type_of_fault == DBG_ZERO_FILL_FAULT) ) { + struct vpl *lq; + uint32_t lid; + + /* + * we got a local queue to stuff this new page on... + * its safe to manipulate local and local_id at this point + * since we're behind an exclusive object lock and the + * page is not on any global queue. + * + * we'll use the current cpu number to select the queue + * note that we don't need to disable preemption... we're + * going to behind the local queue's lock to do the real + * work + */ + lid = cpu_number(); + + lq = &vm_page_local_q[lid].vpl_un.vpl; + + VPL_LOCK(&lq->vpl_lock); + + queue_enter(&lq->vpl_queue, m, vm_page_t, pageq); + m->local = TRUE; + m->local_id = lid; + lq->vpl_count++; + + VPL_UNLOCK(&lq->vpl_lock); + + if (lq->vpl_count > vm_page_local_q_soft_limit) { + /* + * we're beyond the soft limit for the local queue + * vm_page_reactivate_local will 'try' to take + * the global page queue lock... if it can't that's + * ok... 
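
The PMAP_ENTER_OPTIONS/PMAP_ENTER dance above tries a non-blocking mapping first and only falls back to the blocking path after marking the page busy and dropping the object lock. A condensed sketch of that pattern with stand-in lock and pmap helpers:

/* Sketch of the "try pmap_enter without blocking, fall back to the
 * blocking path with the object unlocked" pattern above.  All types and
 * helpers here are stand-ins for the kernel interfaces. */
#include <stdbool.h>

struct obj { int dummy; };
struct pg  { bool busy; };

enum enter_result { ENTER_OK, ENTER_RESOURCE_SHORTAGE };

/* stand-ins for PMAP_ENTER_OPTIONS / PMAP_ENTER and the object lock */
static enum enter_result pmap_enter_nowait(void) { return ENTER_RESOURCE_SHORTAGE; }
static void pmap_enter_blocking(void) {}
static void object_lock(struct obj *o)   { (void)o; }
static void object_unlock(struct obj *o) { (void)o; }
static void page_wakeup_done(struct pg *m) { m->busy = false; }

static void
enter_mapping(struct obj *o, struct pg *m)
{
	/* First attempt: never sleep while the object lock is held, so a
	 * blocked pmap_enter() cannot deadlock against the fault path. */
	if (pmap_enter_nowait() == ENTER_OK)
		return;

	/* Fallback: mark the page busy so it cannot be freed or replaced,
	 * drop the object lock, and take the blocking path. */
	bool was_busy = m->busy;
	m->busy = true;
	object_unlock(o);

	pmap_enter_blocking();

	object_lock(o);
	/* If the page was already busy, its original owner will wake any
	 * waiters; otherwise clear busy and wake them here. */
	if (!was_busy)
		page_wakeup_done(m);
}
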
we'll let the queue continue to grow up + * to the hard limit... at that point we'll wait + * for the lock... once we've got the lock, we'll + * transfer all of the pages from the local queue + * to the global active queue + */ + vm_page_reactivate_local(lid, FALSE, FALSE); + } + return kr; + } + vm_page_lockspin_queues(); /* * test again now that we hold the page queue lock */ - if (((!m->active && !m->inactive) || no_cache) && !m->wire_count) { + if (((!m->active && !m->inactive) || no_cache) && !VM_PAGE_WIRED(m)) { /* * If this is a no_cache mapping and the page has never been @@ -2206,6 +2595,7 @@ vm_fault( boolean_t need_collapse = FALSE; int object_lock_type = 0; int cur_object_lock_type; + vm_object_t top_object = VM_OBJECT_NULL; KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START, @@ -2225,6 +2615,7 @@ vm_fault( return (KERN_FAILURE); } + interruptible_state = thread_interrupt_level(interruptible); VM_STAT_INCR(faults); @@ -2266,6 +2657,8 @@ RetryFault: } pmap = real_map->pmap; fault_info.interruptible = interruptible; + fault_info.stealth = FALSE; + fault_info.mark_zf_absent = FALSE; /* * If the page is wired, we must fault for the current protection @@ -2273,7 +2666,6 @@ RetryFault: */ if (wired) { fault_type = prot | VM_PROT_WRITE; - /* * since we're treating this fault as a 'write' * we must hold the top object lock exclusively @@ -2344,6 +2736,18 @@ RetryFault: cur_offset = offset; while (TRUE) { + if (!cur_object->pager_created && + cur_object->phys_contiguous) /* superpage */ + break; + + if (cur_object->blocked_access) { + /* + * Access to this VM object has been blocked. + * Let the slow path handle it. + */ + break; + } + m = vm_page_lookup(cur_object, cur_offset); if (m != VM_PAGE_NULL) { @@ -2426,6 +2830,17 @@ RetryFault: */ break; } + if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m->object)) { + if (object != cur_object) + vm_object_unlock(object); + vm_map_unlock_read(map); + if (real_map != map) + vm_map_unlock(real_map); + vm_object_unlock(cur_object); + kr = KERN_MEMORY_ERROR; + goto done; + } + if (m->encrypted) { /* * ENCRYPTED SWAP: @@ -2500,9 +2915,9 @@ RetryFault: } ASSERT_PAGE_DECRYPTED(m); - if (m->object->code_signed && !m->cs_validated) { + if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) { /* - * We will need to validate this page + * We might need to validate this page * against its code signature, so we * want to hold the VM object exclusively. */ @@ -2547,23 +2962,48 @@ RetryFault: * --> must disallow write. */ - if (object == cur_object && object->copy == VM_OBJECT_NULL) + if (object == cur_object && object->copy == VM_OBJECT_NULL) { + if ((fault_type & VM_PROT_WRITE) == 0) { + /* + * This is not a "write" fault, so we + * might not have taken the object lock + * exclusively and we might not be able + * to update the "wpmapped" bit in + * vm_fault_enter(). + * Let's just grant read access to + * the page for now and we'll + * soft-fault again if we need write + * access later... + */ + prot &= ~VM_PROT_WRITE; + } goto FastPmapEnter; + } if ((fault_type & VM_PROT_WRITE) == 0) { prot &= ~VM_PROT_WRITE; - /* - * Set up to map the page... - * mark the page busy, drop - * unneeded object lock - */ if (object != cur_object) { - /* - * don't need the original object anymore + /* + * We still need to hold the top object + * lock here to prevent a race between + * a read fault (taking only "shared" + * locks) and a write fault (taking + * an "exclusive" lock on the top + * object. 
+ * Otherwise, as soon as we release the + * top lock, the write fault could + * proceed and actually complete before + * the read fault, and the copied page's + * translation could then be overwritten + * by the read fault's translation for + * the original page. + * + * Let's just record what the top object + * is and we'll release it later. */ - vm_object_unlock(object); + top_object = object; /* * switch to the object that has the new page @@ -2604,10 +3044,24 @@ FastPmapEnter: &type_of_fault); } + if (top_object != VM_OBJECT_NULL) { + /* + * It's safe to drop the top object + * now that we've done our + * vm_fault_enter(). Any other fault + * in progress for that virtual + * address will either find our page + * and translation or put in a new page + * and translation. + */ + vm_object_unlock(top_object); + top_object = VM_OBJECT_NULL; + } + if (need_collapse == TRUE) vm_object_collapse(object, offset, TRUE); - if (type_of_fault == DBG_PAGEIN_FAULT) { + if (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT) { /* * evaluate access pattern and update state * vm_fault_deactivate_behind depends on the @@ -2633,7 +3087,34 @@ FastPmapEnter: } /* * COPY ON WRITE FAULT - * + */ + assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE); + + if (vm_page_throttled()) { + /* + * drop all of our locks... + * wait until the free queue is + * pumped back up and then + * redrive the fault + */ + if (object != cur_object) + vm_object_unlock(cur_object); + vm_object_unlock(object); + vm_map_unlock_read(map); + if (real_map != map) + vm_map_unlock(real_map); + + if (NEED_TO_HARD_THROTTLE_THIS_TASK()) + delay(HARD_THROTTLE_DELAY); + + if (!current_thread_aborted() && vm_page_wait((change_wiring) ? + THREAD_UNINT : + THREAD_ABORTSAFE)) + goto RetryFault; + kr = KERN_ABORTED; + goto done; + } + /* * If objects match, then * object->copy must not be NULL (else control * would be in previous code block), and we @@ -2647,8 +3128,6 @@ FastPmapEnter: */ break; } - assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE); - /* * This is now a shadow based copy on write * fault -- it requires a copy up the shadow @@ -2753,8 +3232,9 @@ FastPmapEnter: * Zero fill fault. Page gets * inserted into the original object. */ - if (cur_object->shadow_severed) { - + if (cur_object->shadow_severed || + VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object)) + { if (object != cur_object) vm_object_unlock(cur_object); vm_object_unlock(object); @@ -2766,7 +3246,7 @@ FastPmapEnter: kr = KERN_MEMORY_ERROR; goto done; } - if (VM_PAGE_ZFILL_THROTTLED()) { + if (vm_page_throttled()) { /* * drop all of our locks... * wait until the free queue is @@ -2780,11 +3260,13 @@ FastPmapEnter: if (real_map != map) vm_map_unlock(real_map); - if (vm_page_wait((change_wiring) ? + if (NEED_TO_HARD_THROTTLE_THIS_TASK()) + delay(HARD_THROTTLE_DELAY); + + if (!current_thread_aborted() && vm_page_wait((change_wiring) ? 
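
The throttled paths in vm_fault() (the copy-on-write and zero-fill cases here) all follow the same shape: drop every lock, optionally sleep the hard-throttle delay, wait for free pages, and re-drive the fault from RetryFault unless the thread was aborted. A condensed sketch of that retry loop; in the kernel the throttle check happens mid-fault with locks held, and every helper below is a stand-in:

/* Condensed control flow of the throttled-fault retry above. */
#include <stdbool.h>

enum fault_status { FAULT_DONE, FAULT_ABORTED };

/* stand-ins for vm_page_throttled(), NEED_TO_HARD_THROTTLE_THIS_TASK(),
 * delay(), vm_page_wait() and the body of the fault itself */
static bool page_creation_is_throttled(void) { return false; }
static bool need_hard_throttle(void)         { return false; }
static void drop_all_fault_locks(void)       {}
static void hard_throttle_delay(void)        {}	/* ~10 ms in the kernel */
static bool wait_for_free_pages(void)        { return true; } /* false = aborted */
static enum fault_status do_fault_attempt(void) { return FAULT_DONE; }

static enum fault_status
fault_with_throttle_retry(void)
{
	for (;;) {
		if (!page_creation_is_throttled())
			return do_fault_attempt();

		/* No lock may be held while we sleep or wait for memory. */
		drop_all_fault_locks();

		if (need_hard_throttle())
			hard_throttle_delay();

		/* If the wait is interrupted or the thread was aborted,
		 * fail the fault instead of spinning on a starved system. */
		if (!wait_for_free_pages())
			return FAULT_ABORTED;
		/* otherwise loop and re-drive the fault from the top */
	}
}
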
THREAD_UNINT : THREAD_ABORTSAFE)) goto RetryFault; - kr = KERN_ABORTED; goto done; } @@ -2925,14 +3407,14 @@ handle_copy_delay: * if kr == VM_FAULT_SUCCESS, then the paging reference * is still held along with the ref_count on the original object * - * if m != NULL, then the object it belongs to - * is returned locked with a paging reference + * the object is returned locked with a paging reference * * if top_page != NULL, then it's BUSY and the * object it belongs to has a paging reference * but is returned unlocked */ - if (kr != VM_FAULT_SUCCESS) { + if (kr != VM_FAULT_SUCCESS && + kr != VM_FAULT_SUCCESS_NO_VM_PAGE) { /* * we didn't succeed, lose the object reference immediately. */ @@ -2961,6 +3443,9 @@ handle_copy_delay: else kr = KERN_MEMORY_ERROR; goto done; + default: + panic("vm_fault: unexpected error 0x%x from " + "vm_fault_page()\n", kr); } } m = result_page; @@ -2978,10 +3463,12 @@ handle_copy_delay: #define RELEASE_PAGE(m) \ MACRO_BEGIN \ PAGE_WAKEUP_DONE(m); \ - vm_page_lockspin_queues(); \ - if (!m->active && !m->inactive && !m->throttled)\ - vm_page_activate(m); \ - vm_page_unlock_queues(); \ + if (!m->active && !m->inactive && !m->throttled) { \ + vm_page_lockspin_queues(); \ + if (!m->active && !m->inactive && !m->throttled) \ + vm_page_activate(m); \ + vm_page_unlock_queues(); \ + } \ MACRO_END /* @@ -2991,8 +3478,10 @@ handle_copy_delay: if (m != VM_PAGE_NULL) { old_copy_object = m->object->copy; vm_object_unlock(m->object); - } else + } else { old_copy_object = VM_OBJECT_NULL; + vm_object_unlock(object); + } /* * no object locks are held at this point @@ -3238,26 +3727,29 @@ handle_copy_delay: (entry->object.vm_object != NULL) && (entry->object.vm_object == object)) { + int superpage = (!object->pager_created && object->phys_contiguous)? 
VM_MEM_SUPERPAGE : 0; if (caller_pmap) { /* * Set up a block mapped area */ + assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12)); pmap_map_block(caller_pmap, (addr64_t)(caller_pmap_addr - ldelta), - (((vm_map_offset_t) (entry->object.vm_object->shadow_offset)) + - entry->offset + (laddr - entry->vme_start) - ldelta) >> 12, - ((ldelta + hdelta) >> 12), prot, - (VM_WIMG_MASK & (int)object->wimg_bits), 0); + (ppnum_t)((((vm_map_offset_t) (entry->object.vm_object->shadow_offset)) + + entry->offset + (laddr - entry->vme_start) - ldelta) >> 12), + (uint32_t)((ldelta + hdelta) >> 12), prot, + (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0); } else { /* * Set up a block mapped area */ + assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12)); pmap_map_block(real_map->pmap, (addr64_t)(vaddr - ldelta), - (((vm_map_offset_t)(entry->object.vm_object->shadow_offset)) + - entry->offset + (laddr - entry->vme_start) - ldelta) >> 12, - ((ldelta + hdelta) >> 12), prot, - (VM_WIMG_MASK & (int)object->wimg_bits), 0); + (ppnum_t)((((vm_map_offset_t)(entry->object.vm_object->shadow_offset)) + + entry->offset + (laddr - entry->vme_start) - ldelta) >> 12), + (uint32_t)((ldelta + hdelta) >> 12), prot, + (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0); } } } @@ -3395,6 +3887,8 @@ vm_fault_unwire( fault_info.lo_offset = entry->offset; fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset; fault_info.no_cache = entry->no_cache; + fault_info.stealth = TRUE; + fault_info.mark_zf_absent = FALSE; /* * Since the pages are wired down, we must be able to @@ -3403,11 +3897,11 @@ vm_fault_unwire( for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) { - if (pmap) { - pmap_change_wiring(pmap, - pmap_addr + (va - entry->vme_start), FALSE); - } if (object == VM_OBJECT_NULL) { + if (pmap) { + pmap_change_wiring(pmap, + pmap_addr + (va - entry->vme_start), FALSE); + } (void) vm_fault(map, va, VM_PROT_NONE, TRUE, THREAD_UNINT, pmap, pmap_addr); } else { @@ -3417,7 +3911,13 @@ vm_fault_unwire( vm_object_t result_object; vm_fault_return_t result; - fault_info.cluster_size = end_addr - va; + if (end_addr - va > (vm_size_t) -1) { + /* 32-bit overflow */ + fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE); + } else { + fault_info.cluster_size = (vm_size_t) (end_addr - va); + assert(fault_info.cluster_size == end_addr - va); + } do { prot = VM_PROT_NONE; @@ -3455,15 +3955,26 @@ vm_fault_unwire( result_object = result_page->object; + if ((pmap) && (result_page->phys_page != vm_page_guard_addr)) { + pmap_change_wiring(pmap, + pmap_addr + (va - entry->vme_start), FALSE); + } if (deallocate) { assert(result_page->phys_page != vm_page_fictitious_addr); pmap_disconnect(result_page->phys_page); VM_PAGE_FREE(result_page); } else { - vm_page_lockspin_queues(); - vm_page_unwire(result_page); - vm_page_unlock_queues(); + if (VM_PAGE_WIRED(result_page)) { + vm_page_lockspin_queues(); + vm_page_unwire(result_page, TRUE); + vm_page_unlock_queues(); + } + if(entry->zero_wired_pages) { + pmap_zero_page(result_page->phys_page); + entry->zero_wired_pages = FALSE; + } + PAGE_WAKEUP_DONE(result_page); } vm_fault_cleanup(result_object, top_page); @@ -3530,7 +4041,7 @@ vm_fault_wire_fast( #define RELEASE_PAGE(m) { \ PAGE_WAKEUP_DONE(m); \ vm_page_lockspin_queues(); \ - vm_page_unwire(m); \ + vm_page_unwire(m, TRUE); \ vm_page_unlock_queues(); \ } @@ -3681,10 +4192,12 @@ vm_fault_copy_cleanup( vm_object_lock(object); PAGE_WAKEUP_DONE(page); - vm_page_lockspin_queues(); - if 
(!page->active && !page->inactive && !page->throttled) - vm_page_activate(page); - vm_page_unlock_queues(); + if (!page->active && !page->inactive && !page->throttled) { + vm_page_lockspin_queues(); + if (!page->active && !page->inactive && !page->throttled) + vm_page_activate(page); + vm_page_unlock_queues(); + } vm_fault_cleanup(object, top_page); } @@ -3698,7 +4211,7 @@ vm_fault_copy_dst_cleanup( object = page->object; vm_object_lock(object); vm_page_lockspin_queues(); - vm_page_unwire(page); + vm_page_unwire(page, TRUE); vm_page_unlock_queues(); vm_object_paging_end(object); vm_object_unlock(object); @@ -3756,6 +4269,7 @@ vm_fault_copy( vm_map_size_t amount_left; vm_object_t old_copy_object; kern_return_t error = 0; + vm_fault_return_t result; vm_map_size_t part_size; struct vm_object_fault_info fault_info_src; @@ -3780,6 +4294,8 @@ vm_fault_copy( fault_info_src.lo_offset = vm_object_trunc_page(src_offset); fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left; fault_info_src.no_cache = FALSE; + fault_info_src.stealth = TRUE; + fault_info_src.mark_zf_absent = FALSE; fault_info_dst.interruptible = interruptible; fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL; @@ -3787,6 +4303,8 @@ vm_fault_copy( fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset); fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left; fault_info_dst.no_cache = FALSE; + fault_info_dst.stealth = TRUE; + fault_info_dst.mark_zf_absent = FALSE; do { /* while (amount_left > 0) */ /* @@ -3803,18 +4321,25 @@ vm_fault_copy( vm_object_lock(dst_object); vm_object_paging_begin(dst_object); - fault_info_dst.cluster_size = amount_left; + if (amount_left > (vm_size_t) -1) { + /* 32-bit overflow */ + fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE); + } else { + fault_info_dst.cluster_size = (vm_size_t) amount_left; + assert(fault_info_dst.cluster_size == amount_left); + } XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0); - switch (vm_fault_page(dst_object, - vm_object_trunc_page(dst_offset), - VM_PROT_WRITE|VM_PROT_READ, - FALSE, - &dst_prot, &dst_page, &dst_top_page, - (int *)0, - &error, - dst_map->no_zero_fill, - FALSE, &fault_info_dst)) { + result = vm_fault_page(dst_object, + vm_object_trunc_page(dst_offset), + VM_PROT_WRITE|VM_PROT_READ, + FALSE, + &dst_prot, &dst_page, &dst_top_page, + (int *)0, + &error, + dst_map->no_zero_fill, + FALSE, &fault_info_dst); + switch (result) { case VM_FAULT_SUCCESS: break; case VM_FAULT_RETRY: @@ -3825,11 +4350,19 @@ vm_fault_copy( /* fall thru */ case VM_FAULT_INTERRUPTED: RETURN(MACH_SEND_INTERRUPTED); + case VM_FAULT_SUCCESS_NO_VM_PAGE: + /* success but no VM page: fail the copy */ + vm_object_paging_end(dst_object); + vm_object_unlock(dst_object); + /*FALLTHROUGH*/ case VM_FAULT_MEMORY_ERROR: if (error) return (error); else return(KERN_MEMORY_ERROR); + default: + panic("vm_fault_copy: unexpected error 0x%x from " + "vm_fault_page()\n", result); } assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE); @@ -3880,20 +4413,27 @@ vm_fault_copy( src_prot = VM_PROT_READ; vm_object_paging_begin(src_object); - fault_info_src.cluster_size = amount_left; + if (amount_left > (vm_size_t) -1) { + /* 32-bit overflow */ + fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE); + } else { + fault_info_src.cluster_size = (vm_size_t) amount_left; + assert(fault_info_src.cluster_size == amount_left); + } XPR(XPR_VM_FAULT, "vm_fault_copy(2) -> vm_fault_page\n", 0,0,0,0,0); - switch (vm_fault_page( - src_object, - vm_object_trunc_page(src_offset), - 
VM_PROT_READ, FALSE, - &src_prot, - &result_page, &src_top_page, - (int *)0, &error, FALSE, - FALSE, &fault_info_src)) { - + result = vm_fault_page( + src_object, + vm_object_trunc_page(src_offset), + VM_PROT_READ, FALSE, + &src_prot, + &result_page, &src_top_page, + (int *)0, &error, FALSE, + FALSE, &fault_info_src); + + switch (result) { case VM_FAULT_SUCCESS: break; case VM_FAULT_RETRY: @@ -3905,12 +4445,21 @@ vm_fault_copy( case VM_FAULT_INTERRUPTED: vm_fault_copy_dst_cleanup(dst_page); RETURN(MACH_SEND_INTERRUPTED); + case VM_FAULT_SUCCESS_NO_VM_PAGE: + /* success but no VM page: fail */ + vm_object_paging_end(src_object); + vm_object_unlock(src_object); + /*FALLTHROUGH*/ case VM_FAULT_MEMORY_ERROR: vm_fault_copy_dst_cleanup(dst_page); if (error) return (error); else return(KERN_MEMORY_ERROR); + default: + panic("vm_fault_copy(2): unexpected " + "error 0x%x from " + "vm_fault_page()\n", result); } @@ -3965,11 +4514,20 @@ vm_fault_copy( } if (result_page == VM_PAGE_NULL) { + assert((vm_offset_t) dst_po == dst_po); + assert((vm_size_t) part_size == part_size); vm_page_part_zero_fill(dst_page, - dst_po, part_size); + (vm_offset_t) dst_po, + (vm_size_t) part_size); } else { - vm_page_part_copy(result_page, src_po, - dst_page, dst_po, part_size); + assert((vm_offset_t) src_po == src_po); + assert((vm_offset_t) dst_po == dst_po); + assert((vm_size_t) part_size == part_size); + vm_page_part_copy(result_page, + (vm_offset_t) src_po, + dst_page, + (vm_offset_t) dst_po, + (vm_size_t)part_size); if(!dst_page->dirty){ vm_object_lock(dst_object); dst_page->dirty = TRUE; @@ -4102,6 +4660,89 @@ vm_fault_classify_init(void) extern int cs_validation; +void +vm_page_validate_cs_mapped( + vm_page_t page, + const void *kaddr) +{ + vm_object_t object; + vm_object_offset_t offset; + kern_return_t kr; + memory_object_t pager; + void *blobs; + boolean_t validated, tainted; + + assert(page->busy); + vm_object_lock_assert_exclusive(page->object); + + if (!cs_validation) { + return; + } + + if (page->wpmapped && !page->cs_tainted) { + /* + * This page was mapped for "write" access sometime in the + * past and could still be modifiable in the future. + * Consider it tainted. + * [ If the page was already found to be "tainted", no + * need to re-validate. ] + */ + page->cs_validated = TRUE; + page->cs_tainted = TRUE; + if (cs_debug) { + printf("CODESIGNING: vm_page_validate_cs: " + "page %p obj %p off 0x%llx " + "was modified\n", + page, page->object, page->offset); + } + vm_cs_validated_dirtied++; + } + + if (page->cs_validated) { + return; + } + + vm_cs_validates++; + + object = page->object; + assert(object->code_signed); + offset = page->offset; + + if (!object->alive || object->terminating || object->pager == NULL) { + /* + * The object is terminating and we don't have its pager + * so we can't validate the data... + */ + return; + } + /* + * Since we get here to validate a page that was brought in by + * the pager, we know that this pager is all setup and ready + * by now. 
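
Several hunks above narrow 64-bit map quantities into 32-bit cluster sizes and page counts: oversized lengths are clamped to the largest page-aligned 32-bit value, and conversions that must be exact are asserted. A stand-alone illustration of both idioms (vm_size32_t and PAGE_SIZE here are stand-ins):

/* Illustration of the 64-bit -> 32-bit narrowing idioms above: clamp a
 * length that may not fit into a 32-bit cluster size, and assert that a
 * page-frame count converts exactly. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096u
typedef uint32_t vm_size32_t;

static vm_size32_t
clamp_cluster_size(uint64_t length)
{
	if (length > (vm_size32_t)-1) {
		/* too big: use the largest page-aligned 32-bit size */
		return (vm_size32_t)(0 - PAGE_SIZE);
	}
	vm_size32_t size = (vm_size32_t)length;
	assert(size == length);		/* conversion must be exact */
	return size;
}

static uint32_t
page_count_32(uint64_t bytes)
{
	uint64_t pages = bytes >> 12;	/* 4 KB pages */

	/* Block mappings pass a 32-bit page count; make sure it fits. */
	assert((uint32_t)pages == pages);
	return (uint32_t)pages;
}

int
main(void)
{
	printf("cluster for 1 MB : 0x%x\n", clamp_cluster_size(1 << 20));
	printf("cluster for 8 GB : 0x%x\n", clamp_cluster_size(8ULL << 30));
	printf("pages in 64 MB   : %u\n", page_count_32(64ULL << 20));
	return 0;
}
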
+ */ + assert(!object->internal); + assert(object->pager != NULL); + assert(object->pager_ready); + + pager = object->pager; + assert(object->paging_in_progress); + kr = vnode_pager_get_object_cs_blobs(pager, &blobs); + if (kr != KERN_SUCCESS) { + blobs = NULL; + } + + /* verify the SHA1 hash for this page */ + validated = cs_validate_page(blobs, + offset + object->paging_offset, + (const void *)kaddr, + &tainted); + + page->cs_validated = validated; + if (validated) { + page->cs_tainted = tainted; + } +} + void vm_page_validate_cs( vm_page_t page) @@ -4112,18 +4753,41 @@ vm_page_validate_cs( vm_map_size_t ksize; vm_offset_t kaddr; kern_return_t kr; - memory_object_t pager; - void *blobs; - boolean_t validated, tainted; boolean_t busy_page; - vm_object_lock_assert_exclusive(page->object); - assert(!page->cs_validated); + vm_object_lock_assert_held(page->object); if (!cs_validation) { return; } + if (page->wpmapped && !page->cs_tainted) { + vm_object_lock_assert_exclusive(page->object); + + /* + * This page was mapped for "write" access sometime in the + * past and could still be modifiable in the future. + * Consider it tainted. + * [ If the page was already found to be "tainted", no + * need to re-validate. ] + */ + page->cs_validated = TRUE; + page->cs_tainted = TRUE; + if (cs_debug) { + printf("CODESIGNING: vm_page_validate_cs: " + "page %p obj %p off 0x%llx " + "was modified\n", + page, page->object, page->offset); + } + vm_cs_validated_dirtied++; + } + + if (page->cs_validated) { + return; + } + + vm_object_lock_assert_exclusive(page->object); + object = page->object; assert(object->code_signed); offset = page->offset; @@ -4149,53 +4813,20 @@ vm_page_validate_cs( object, offset, &ksize, + VM_PROT_READ, FALSE); /* can't unlock object ! */ if (kr != KERN_SUCCESS) { panic("vm_page_validate_cs: could not map page: 0x%x\n", kr); } kaddr = CAST_DOWN(vm_offset_t, koffset); - /* - * Since we get here to validate a page that was brought in by - * the pager, we know that this pager is all setup and ready - * by now. - */ - assert(!object->internal); - assert(object->pager != NULL); - assert(object->pager_ready); - - if (!object->alive || object->terminating || object->pager == NULL) { - /* - * The object is terminating and we don't have its pager - * so we can't validate the data... - */ - goto out; - } - - pager = object->pager; - assert(pager != NULL); - - kr = vnode_pager_get_object_cs_blobs(pager, &blobs); - if (kr != KERN_SUCCESS) { - blobs = NULL; - } - - /* verify the SHA1 hash for this page */ - validated = cs_validate_page(blobs, - offset + object->paging_offset, - (const void *)kaddr, - &tainted); + /* validate the mapped page */ + vm_page_validate_cs_mapped(page, (const void *) kaddr); assert(page->busy); assert(object == page->object); vm_object_lock_assert_exclusive(object); - page->cs_validated = validated; - if (validated) { - page->cs_tainted = tainted; - } - -out: if (!busy_page) { PAGE_WAKEUP_DONE(page); }
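
vm_page_validate_cs_mapped() above reduces to: a page that was ever mapped writable is marked tainted without re-hashing; otherwise the object's signature blobs are fetched from the pager and the page's hash is checked against the code directory entry for its offset. A toy user-space model of that flow; the digest below is a placeholder (FNV-1a, not SHA-1), and every name is a stand-in rather than the kernel or vnode pager interface.

/* Toy model of the per-page validation flow in vm_page_validate_cs_mapped()
 * above.  page_digest() is a placeholder checksum and the "code directory"
 * is just an array of expected digests indexed by page.  In the kernel,
 * cs_validate_page() also reports whether the offset is covered by the
 * signature at all; that case is omitted here. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096

struct cs_page_state {
	bool cs_validated;
	bool cs_tainted;
	bool wpmapped;
};

static uint64_t
page_digest(const unsigned char *kaddr)
{
	/* placeholder digest (FNV-1a); the kernel uses a real hash */
	uint64_t h = 14695981039346656037ULL;
	for (size_t i = 0; i < PAGE_SIZE; i++)
		h = (h ^ kaddr[i]) * 1099511628211ULL;
	return h;
}

static void
validate_cs_mapped(struct cs_page_state *p, size_t page_index,
    const unsigned char *kaddr, const uint64_t *code_directory)
{
	if (p->wpmapped && !p->cs_tainted) {
		/* Mapped writable at some point: its contents could have
		 * changed behind our back, so consider it tainted without
		 * even rehashing it. */
		p->cs_validated = true;
		p->cs_tainted = true;
	}
	if (p->cs_validated)
		return;		/* already checked (or just marked tainted) */

	/* Compare the page's digest with the signed expectation. */
	bool matches = page_digest(kaddr) == code_directory[page_index];

	p->cs_validated = true;
	p->cs_tainted = !matches;
}
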